Spaces:
Running
Running
add app
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +361 -0
- configs/dataset/rec/evaluation.yaml +41 -0
- configs/dataset/rec/ltb.yaml +9 -0
- configs/dataset/rec/mjsynth.yaml +11 -0
- configs/dataset/rec/openvino.yaml +25 -0
- configs/dataset/rec/ost.yaml +17 -0
- configs/dataset/rec/synthtext.yaml +7 -0
- configs/dataset/rec/test.yaml +77 -0
- configs/dataset/rec/textocr.yaml +13 -0
- configs/dataset/rec/textocr_horizontal.yaml +13 -0
- configs/dataset/rec/union14m_b.yaml +47 -0
- configs/dataset/rec/union14m_l_filtered.yaml +35 -0
- configs/det/dbnet/repvit_db.yml +171 -0
- configs/rec/abinet/resnet45_trans_abinet_lang.yml +94 -0
- configs/rec/abinet/resnet45_trans_abinet_wo_lang.yml +93 -0
- configs/rec/abinet/svtrv2_abinet_lang.yml +130 -0
- configs/rec/abinet/svtrv2_abinet_wo_lang.yml +128 -0
- configs/rec/aster/resnet31_lstm_aster_tps_on.yml +93 -0
- configs/rec/aster/svtrv2_aster.yml +127 -0
- configs/rec/aster/svtrv2_aster_tps_on.yml +102 -0
- configs/rec/autostr/autostr_lstm_aster_tps_on.yml +95 -0
- configs/rec/busnet/svtrv2_busnet.yml +135 -0
- configs/rec/busnet/svtrv2_busnet_pretraining.yml +134 -0
- configs/rec/busnet/vit_busnet.yml +104 -0
- configs/rec/busnet/vit_busnet_pretraining.yml +104 -0
- configs/rec/cam/convnextv2_cam_tps_on.yml +118 -0
- configs/rec/cam/convnextv2_tiny_cam_tps_on.yml +118 -0
- configs/rec/cam/svtrv2_cam_tps_on.yml +123 -0
- configs/rec/cdistnet/resnet45_trans_cdistnet.yml +93 -0
- configs/rec/cdistnet/svtrv2_cdistnet.yml +139 -0
- configs/rec/cppd/svtr_base_cppd.yml +123 -0
- configs/rec/cppd/svtr_base_cppd_ch.yml +126 -0
- configs/rec/cppd/svtr_base_cppd_h8.yml +123 -0
- configs/rec/cppd/svtr_base_cppd_syn.yml +124 -0
- configs/rec/cppd/svtrv2_cppd.yml +150 -0
- configs/rec/dan/resnet45_fpn_dan.yml +98 -0
- configs/rec/dan/svtrv2_dan.yml +130 -0
- configs/rec/dptr/dptr_parseq_pretrain.yml +88 -0
- configs/rec/focalsvtr/focalsvtr_ctc.yml +137 -0
- configs/rec/gtc/svtrv2_lnconv_nrtr_gtc.yml +168 -0
- configs/rec/gtc/svtrv2_lnconv_smtr_gtc_long_infer.yml +151 -0
- configs/rec/gtc/svtrv2_lnconv_smtr_gtc_smtr_long.yml +150 -0
- configs/rec/gtc/svtrv2_lnconv_smtr_gtc_stream.yml +152 -0
- configs/rec/igtr/readme.md +192 -0
- configs/rec/igtr/svtr_base_ds_igtr.yml +157 -0
- configs/rec/lister/focalsvtr_lister_wo_fem_maxratio12.yml +133 -0
- configs/rec/lister/svtrv2_lister_wo_fem_maxratio12.yml +138 -0
- configs/rec/lpv/svtr_base_lpv.yml +124 -0
- configs/rec/lpv/svtr_base_lpv_wo_glrm.yml +123 -0
- configs/rec/lpv/svtrv2_lpv.yml +147 -0
app.py
ADDED
|
@@ -0,0 +1,361 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import uuid
|
| 3 |
+
import shutil
|
| 4 |
+
import re
|
| 5 |
+
import base64
|
| 6 |
+
import gradio as gr
|
| 7 |
+
from PIL import Image
|
| 8 |
+
|
| 9 |
+
from tools.infer_doc import OpenDoc
|
| 10 |
+
from tools.utils.logging import get_logger
|
| 11 |
+
|
| 12 |
+
logger = get_logger(name='opendoc_gradio')
|
| 13 |
+
|
| 14 |
+
# Initialize the pipeline
|
| 15 |
+
pipeline: OpenDoc | None = None
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def get_pipeline(gpu_id: int) -> OpenDoc:
    """Return the shared OpenDoc pipeline, creating it on first use.

    The instance is stored in the module-level ``pipeline`` variable, so
    ``gpu_id`` only takes effect on the call that actually constructs it;
    later calls return the existing instance regardless of ``gpu_id``.

    Args:
        gpu_id: GPU device ID forwarded to ``OpenDoc(gpuId=...)``. Negative
            values are reported as CPU in the log message (the original
            docstring states that -1 selects the CPU).

    Returns:
        OpenDoc: The initialized OpenDoc instance.
    """
    global pipeline
    # Fast path: the pipeline has already been built.
    if pipeline is not None:
        return pipeline

    logger.info(
        f"Initializing OpenDoc pipeline on {'GPU ' + str(gpu_id) if gpu_id >= 0 else 'CPU'}..."
    )
    pipeline = OpenDoc(gpuId=gpu_id)
    return pipeline
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Ensure pipeline is initialized
|
| 37 |
+
# Build the shared pipeline (device 0) while the module is being imported.
# A failure is logged and then re-raised unchanged: the bare ``raise`` keeps
# the original traceback, and the import still fails as it did before.
try:
    current_pipeline = get_pipeline(0)
except Exception as e:
    logger.error(f'Failed to initialize OpenDoc pipeline: {e}')
    raise
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# Matches an HTML <img> tag and captures the value of its src attribute.
_IMG_TAG_PATTERN = re.compile(r'<img[^>]*src="([^"]+)"[^>]*>')


def _read_first_file_with_suffix(directory: str, suffix: str) -> str:
    """Return the UTF-8 text of the first file in *directory* ending with *suffix*.

    Returns an empty string when no file matches.
    """
    for entry in os.listdir(directory):
        if entry.endswith(suffix):
            with open(os.path.join(directory, entry), 'r', encoding='utf-8') as file:
                return file.read()
    return ''


def _embed_images_as_base64(markdown: str, base_dir: str) -> str:
    """Replace the src of <img> tags that point at files under *base_dir* with base64 data URLs.

    Tags whose src does not resolve to an existing path, or whose file cannot
    be read, are left unchanged.
    """

    def _inline(match: re.Match) -> str:
        img_path = match.group(1)
        full_img_path = os.path.join(base_dir, img_path)

        if os.path.exists(full_img_path):
            try:
                with open(full_img_path, 'rb') as img_file:
                    img_data = base64.b64encode(img_file.read()).decode('utf-8')
                # Determine image format: anything that is not JPEG is labelled PNG.
                ext = os.path.splitext(full_img_path)[1].lower()
                mime_type = 'image/jpeg' if ext in ['.jpg', '.jpeg'] else 'image/png'
                # Replace src with base64 data URL
                return match.group(0).replace(f'src="{img_path}"', f'src="data:{mime_type};base64,{img_data}"')
            except OSError as e:
                logger.warning(f'Failed to convert image {img_path} to base64: {e}')
        return match.group(0)

    return _IMG_TAG_PATTERN.sub(_inline, markdown)


def process_image(image_path: str | None) -> tuple[Image.Image | None, str, str, str | None, str, str]:
    """Run the OpenDoc pipeline on one image and collect the saved results.

    The image is re-saved into a per-request folder under ``gradio_outputs``,
    the pipeline's image / Markdown / JSON outputs are written next to it, and
    the whole folder is zipped for download.

    Args:
        image_path: Path of the uploaded image file, or None when no image
            was provided.

    Returns:
        tuple: (layout visualization image, Markdown with images inlined as
        base64, JSON content, ZIP file path, raw Markdown, Markdown with
        images inlined as base64). When there is no input, no result, or an
        error occurs, the image and ZIP path are None and the second element
        carries the status message (empty for missing input).
    """
    if image_path is None:
        return None, '', '', None, '', ''

    # Get original image name
    base_name = os.path.splitext(os.path.basename(image_path))[0]
    file_ext = os.path.splitext(image_path)[1] or '.jpg'

    # Create a directory with image name for this request
    output_base_dir = 'gradio_outputs'
    os.makedirs(output_base_dir, exist_ok=True)

    # Add a random UUID fragment so repeated uploads of the same filename do
    # not collide.
    unique_suffix = str(uuid.uuid4())[:8]
    folder_name = f"{base_name}_{unique_suffix}"
    tmp_dir = os.path.join(output_base_dir, folder_name)
    os.makedirs(tmp_dir, exist_ok=True)

    try:
        # Copy and rename the input image; the context manager closes the
        # source file handle once the copy has been written.
        tmp_img_path = os.path.join(tmp_dir, f'{base_name}{file_ext}')
        with Image.open(image_path) as image:
            image.save(tmp_img_path)

        # Predict
        output = list(
            current_pipeline.predict(tmp_img_path,
                                     use_doc_orientation_classify=False,
                                     use_doc_unwarping=False))
        if not output:
            return None, 'No results found.', '', None, '', ''

        res = output[0]

        # Save results
        res.save_to_img(tmp_dir)
        res.save_to_markdown(tmp_dir, pretty=True)
        res.save_to_json(tmp_dir)

        # Find the saved layout visualization. It is copied into memory so the
        # file handle is released before the image is returned.
        vis_img = None
        for f in os.listdir(tmp_dir):
            if 'layout_order_res' in f:
                with Image.open(os.path.join(tmp_dir, f)) as layout_img:
                    vis_img = layout_img.copy()
                break

        markdown_content = _read_first_file_with_suffix(tmp_dir, '.md')

        # Convert relative image paths to base64 for proper display in Gradio
        markdown_content_show = _embed_images_as_base64(markdown_content, tmp_dir)

        json_content = _read_first_file_with_suffix(tmp_dir, '.json')

        # Prepare all files in tmp_dir for download by creating a zip archive.
        # The archive base name is built directly rather than by stripping
        # '.zip' from the target path, which would also remove a '.zip' that
        # happens to be part of the uploaded file's name.
        archive_base = os.path.join(output_base_dir, folder_name)
        shutil.make_archive(archive_base, 'zip', tmp_dir)
        zip_path = f'{archive_base}.zip'

        return vis_img, markdown_content_show, json_content, zip_path, markdown_content, markdown_content_show

    except Exception as e:
        # UI boundary: report the failure in the Markdown pane instead of raising.
        logger.error(f'Prediction error: {str(e)}')
        return None, f'Error during prediction: {str(e)}', '', None, '', ''
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# Custom CSS with adaptive colors.
# Passed to gr.Blocks(css=...) in create_demo. Colors, borders and shadows are
# taken from CSS custom properties (var(--...)) instead of fixed values
# (presumably supplied by the active Gradio theme - confirm). The
# .upload-section class and the #vis_output / #md_preview ids match the
# elem_classes / elem_id values assigned in create_demo.
custom_css = """
body, .gradio-container {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Noto Sans", Helvetica, Arial, sans-serif;
}
.app-header {
text-align: center;
max-width: 1200px;
margin: 20px auto !important;
padding: 20px;
}
.app-header h1 {
font-size: 2.5em;
font-weight: 700;
margin-bottom: 10px;
}
.app-header p {
font-size: 1.1em;
opacity: 0.7;
line-height: 1.6;
}
.quick-links {
text-align: center;
padding: 12px 0;
border: 1px solid var(--border-color-primary);
border-radius: 12px;
margin: 16px auto;
max-width: 1200px;
background: var(--background-fill-secondary);
}
.quick-links a {
margin: 0 16px;
font-size: 15px;
font-weight: 600;
color: var(--link-text-color);
text-decoration: none;
transition: all 0.3s ease;
}
.quick-links a:hover {
opacity: 0.8;
text-decoration: underline;
}
.upload-section {
border: 2px dashed var(--border-color-primary);
border-radius: 12px;
padding: 20px;
background: var(--background-fill-secondary);
transition: all 0.3s ease;
}
.upload-section:hover {
border-color: var(--color-accent);
background: var(--background-fill-primary);
}
#vis_output {
min-height: 400px;
border-radius: 12px;
overflow: hidden;
}
#md_preview {
max-height: 600px;
min-height: 200px;
overflow: auto;
padding: 20px;
background: var(--background-fill-primary);
border-radius: 12px;
box-shadow: var(--shadow-drop);
}
#md_preview img {
display: block;
margin: 16px auto;
max-width: 100%;
height: auto;
border-radius: 8px;
}
.notice {
margin: 20px auto;
max-width: 1200px;
padding: 16px 20px;
border-left: 4px solid var(--color-accent);
border-radius: 8px;
background: var(--background-fill-secondary);
font-size: 14px;
line-height: 1.8;
}
.notice strong {
font-weight: 700;
color: var(--color-accent);
}
.notice ul {
margin-top: 8px;
padding-left: 20px;
}
.notice li {
margin: 8px 0;
}
.gradio-button-primary {
font-weight: 600 !important;
transition: all 0.3s ease !important;
}
.gradio-button-primary:hover {
transform: translateY(-2px);
box-shadow: var(--shadow-drop-lg) !important;
}
"""
|
| 256 |
+
|
| 257 |
+
# LaTeX delimiters for formula rendering.
# Passed as ``latex_delimiters`` to the Markdown preview component in
# create_demo. Each entry names the opening ("left") and closing ("right")
# marker plus a "display" flag; the $$...$$ and \[...\] pairs are flagged
# True, the $...$ and \(...\) pairs False (presumably block vs. inline
# rendering - confirm against the Gradio Markdown documentation).
LATEX_DELIMS = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
    {"left": "\\(", "right": "\\)", "display": False},
    {"left": "\\[", "right": "\\]", "display": True},
]
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
# Define the Gradio Interface
|
| 267 |
+
def create_demo() -> gr.Blocks:
    """Build the Gradio demo interface.

    The page consists of a header, a row of quick links, a two-column work
    area (upload and controls on the left, tabbed results on the right) and
    a feature notice. The analyze button is wired to ``process_image``.

    Returns:
        gr.Blocks: The assembled Gradio Blocks application.
    """
    with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title='OpenDoc-0.1B Demo') as demo:
        # Header
        gr.HTML("""
<div class="app-header">
<h1>🚀 OpenDoc-0.1B</h1>
<p>Ultra-Lightweight Document Parsing System with 0.1B Parameters (built by <a href="https://github.com/Topdu/OpenOCR">OCR Team</a>, <a href="https://fvl.fudan.edu.cn">FVL Lab</a>)</p>
<p style="font-size: 0.95em; color: #888;">
Powered by <a href="https://www.paddleocr.ai/latest/version3.x/module_usage/layout_analysis.html" target="_blank">PP-DocLayoutV2</a> for layout analysis and <a href="https://arxiv.org/pdf/2512.21095" target="_blank">UniRec-0.1B</a> for unified recognition of text, formulas, and tables
</p>
</div>
""")

        # Quick links
        gr.HTML("""
<div class="quick-links">
<a href="https://github.com/Topdu/OpenOCR" target="_blank">📖 GitHub</a>
<a href="https://arxiv.org/pdf/2512.21095" target="_blank">📄 Paper</a>
<a href="https://huggingface.co/topdu/unirec-0.1b" target="_blank">🤗 Model</a>
</div>
""")

        with gr.Row():
            # Left column: image upload, usage tips, trigger button and ZIP download.
            with gr.Column(scale=5, elem_classes=["upload-section"]):
                # type='filepath' hands process_image a path string rather than pixel data.
                input_img = gr.Image(type='filepath', label='📤 Upload Document Image', height=400)

                gr.Markdown("""
### 💡 Tips
- Supports Chinese and English documents
- Best for reports, papers, magazines, and complex layouts
- Handles text, formulas, tables, and images
""")

                btn = gr.Button('🔍 Analyze Document', variant='primary', size='lg')
                download_output = gr.File(label='📥 Download All Results (ZIP)', visible=True)

            # Right column: one tab per view of the result.
            with gr.Column(scale=7):
                with gr.Tabs():
                    with gr.Tab('📝 Markdown Preview'):
                        output_md = gr.Markdown(
                            'Please upload an image and click "Analyze Document" to see results.',
                            latex_delimiters=LATEX_DELIMS,
                            elem_id='md_preview'
                        )
                    with gr.Tab('📊 Layout Visualization'):
                        output_vis = gr.Image(type='pil', label='Layout Analysis Results', elem_id='vis_output')

                    with gr.Tab('📄 Raw Markdown'):
                        output_md_raw = gr.Code(
                            label='Markdown Source',
                            language='markdown',
                            lines=20
                        )
                    with gr.Tab('📄 Raw Markdown with Base64 Images'):
                        output_md_raw_with_base64 = gr.Code(
                            label='Markdown Source',
                            language='markdown',
                            lines=20
                        )

                    with gr.Tab('🗂️ JSON Result'):
                        output_json = gr.Code(label='Structured Data', language='json')

        # Feature notice
        gr.HTML("""
<div class="notice">
<strong>✨ Key Features:</strong>
<ul>
<li><strong>Ultra-lightweight:</strong> Only 0.1B parameters, fast inference speed</li>
<li><strong>High accuracy:</strong> Achieves 90.57% on OmniDocBench (v1.5)</li>
<li><strong>Unified recognition:</strong> Handles text, formulas, and tables in one model</li>
<li><strong>Rich output:</strong> Provides Markdown, JSON, and visualization results</li>
</ul>
</div>
""")

        # The order of ``outputs`` must match the tuple returned by
        # process_image: (visualization, preview Markdown, JSON, ZIP path,
        # raw Markdown, Markdown with base64 images).
        btn.click(
            fn=process_image,
            inputs=[input_img],
            outputs=[output_vis, output_md, output_json, download_output, output_md_raw, output_md_raw_with_base64]
        )

    return demo
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
if __name__ == '__main__':
    # Script entry point: build the UI, enable a request queue capped at 20
    # entries, and launch without a public share link.
    demo = create_demo()
    demo.queue(max_size=20).launch(share=False)
|
configs/dataset/rec/evaluation.yaml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
root: ../evaluation
|
| 2 |
+
task: str
|
| 3 |
+
download_links:
|
| 4 |
+
# IC15_1811
|
| 5 |
+
- https://drive.usercontent.google.com/download?id=1eGY0kXNV1qVxeUpoGzs-ioUO-ky7msH6&authuser=0&confirm=t
|
| 6 |
+
- https://drive.usercontent.google.com/download?id=1BWv7aLoLAT7avY326gXP3GJF48UZpuBC&authuser=0&confirm=t
|
| 7 |
+
# SVT
|
| 8 |
+
- https://drive.usercontent.google.com/download?id=1ecEZ4cJ7dIbTCZRltE0s5KzUotQWagH-&authuser=0&confirm=t
|
| 9 |
+
- https://drive.usercontent.google.com/download?id=1OygBP7i9R-3Pwi6WodCcW31J8CUMugOJ&authuser=0&confirm=t
|
| 10 |
+
# IIIT5k
|
| 11 |
+
- https://drive.usercontent.google.com/download?id=1PJ9_IvIGZTS5hHdGLnpKuYKZcCO8jE0E&authuser=0&confirm=t
|
| 12 |
+
- https://drive.usercontent.google.com/download?id=10P3MixSBt1v8k8_6aFfziC33Z5IlM6Uf&authuser=0&confirm=t
|
| 13 |
+
# IC13_857
|
| 14 |
+
- https://drive.usercontent.google.com/download?id=1-wMHOFBXJaOaY-UD00nDn6qw2s_8R4Vd&authuser=0&confirm=t
|
| 15 |
+
- https://drive.usercontent.google.com/download?id=1J1QCFtOFxFKiLJIgTqZ6eRo9Y5QGqHpA&authuser=0&confirm=t
|
| 16 |
+
# SVTP
|
| 17 |
+
- https://drive.usercontent.google.com/download?id=1kckwfZkdaHG8k_FW5IIJKUaYZkF21Hza&authuser=0&confirm=t
|
| 18 |
+
- https://drive.usercontent.google.com/download?id=1x61lm_ea7lvIdxNPMG-jy-5W0MxtdH0N&authuser=0&confirm=t
|
| 19 |
+
# CUTE80
|
| 20 |
+
- https://drive.usercontent.google.com/download?id=1Zv_91c81tinLy5Je89HPr-5wUSnqXKIB&authuser=0&confirm=t
|
| 21 |
+
- https://drive.usercontent.google.com/download?id=1OuJ6QoJ9AlyNHIM9j2WedAPxTnac7kyY&authuser=0&confirm=t
|
| 22 |
+
filenames:
|
| 23 |
+
# IC15_1811
|
| 24 |
+
- ../evaluation/IC15_1811/data.mdb
|
| 25 |
+
- ../evaluation/IC15_1811/lock.mdb
|
| 26 |
+
# SVT
|
| 27 |
+
- ../evaluation/SVT/data.mdb
|
| 28 |
+
- ../evaluation/SVT/lock.mdb
|
| 29 |
+
# IIIT5k
|
| 30 |
+
- ../evaluation/IIIT5k/data.mdb
|
| 31 |
+
- ../evaluation/IIIT5k/lock.mdb
|
| 32 |
+
# IC13_857
|
| 33 |
+
- ../evaluation/IC13_857/data.mdb
|
| 34 |
+
- ../evaluation/IC13_857/lock.mdb
|
| 35 |
+
# SVTP
|
| 36 |
+
- ../evaluation/SVTP/data.mdb
|
| 37 |
+
- ../evaluation/SVTP/lock.mdb
|
| 38 |
+
# CUTE80
|
| 39 |
+
- ../evaluation/CUTE80/data.mdb
|
| 40 |
+
- ../evaluation/CUTE80/lock.mdb
|
| 41 |
+
check_validity: true
|
configs/dataset/rec/ltb.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
root: ../ltb
|
| 2 |
+
task: str
|
| 3 |
+
download_links:
|
| 4 |
+
- https://drive.usercontent.google.com/download?id=16AEA1YGTsyVB44uEjKi4ZUV1snjCYBr4&authuser=0&confirm=t
|
| 5 |
+
- https://drive.usercontent.google.com/download?id=1xU4OStrOaI23bPG4flWAPWn2YrQe2bmY&authuser=0&confirm=t
|
| 6 |
+
filenames:
|
| 7 |
+
- ../ltb/data.mdb
|
| 8 |
+
- ../ltb/lock.mdb
|
| 9 |
+
check_validity: true
|
configs/dataset/rec/mjsynth.yaml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
root: ../synth
|
| 2 |
+
task: str
|
| 3 |
+
download_links:
|
| 4 |
+
- https://drive.usercontent.google.com/download?id=1FIoplSFZ-BKQoRDHDXsVMKa844e-K8PD&authuser=0&confirm=t
|
| 5 |
+
- https://drive.usercontent.google.com/download?id=1eckTvaeRtlTZvbO2orrVz-cIuIk6i87K&authuser=0&confirm=t
|
| 6 |
+
- https://drive.usercontent.google.com/download?id=1PBXTf-2PnmEvJBsqzJqxxRwzhAZGTiMG&authuser=0&confirm=t
|
| 7 |
+
filenames:
|
| 8 |
+
- ../synth/MJ_train.zip
|
| 9 |
+
- ../synth/MJ_val.zip
|
| 10 |
+
- ../synth/MJ_test.zip
|
| 11 |
+
check_validity: true
|
configs/dataset/rec/openvino.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
root: ../OpenVINO
|
| 2 |
+
task: str
|
| 3 |
+
download_links:
|
| 4 |
+
# train_1
|
| 5 |
+
- https://drive.usercontent.google.com/download?id=1q23QAIRTyG0t-bBm4aAwRwiqB6VUfphw&authuser=0&confirm=
|
| 6 |
+
# train_2
|
| 7 |
+
- https://drive.usercontent.google.com/download?id=1AtbaJljM68cbZqi5lcM92d9VkQUCbSqI&authuser=0&confirm=
|
| 8 |
+
# train_5
|
| 9 |
+
- https://drive.usercontent.google.com/download?id=1dejstYnJ8_sESuO_uvwi__jT1B8gPxf3&authuser=0&confirm=t
|
| 10 |
+
# train_f
|
| 11 |
+
- https://drive.usercontent.google.com/download?id=1C4akchTc7-yi1OS_sJ3KP693UKcnecke&authuser=0&confirm=t
|
| 12 |
+
# validation
|
| 13 |
+
- https://drive.usercontent.google.com/download?id=17TRzSQhuK_juAxAv3KmX0y13pQP2cz6R&authuser=0&confirm=t
|
| 14 |
+
filenames:
|
| 15 |
+
# train_1
|
| 16 |
+
- ../OpenVINO/train_1.zip
|
| 17 |
+
# train_2
|
| 18 |
+
- ../OpenVINO/train_2.zip
|
| 19 |
+
# train_5
|
| 20 |
+
- ../OpenVINO/train_5.zip
|
| 21 |
+
# train_f
|
| 22 |
+
- ../OpenVINO/train_f.zip
|
| 23 |
+
# validation
|
| 24 |
+
- ../OpenVINO/validation.zip
|
| 25 |
+
check_validity: true
|
configs/dataset/rec/ost.yaml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
root: ../OST
|
| 2 |
+
task: str
|
| 3 |
+
download_links:
|
| 4 |
+
# OST heavy
|
| 5 |
+
- https://drive.usercontent.google.com/download?id=1RGpIFbD_SRlrzZFBoVF_LGvetNx1-5pg&authuser=0&confirm=t
|
| 6 |
+
- https://drive.usercontent.google.com/download?id=1Th4MfDf44k0EBpIqCLqVoGRu6G-FP1hq&authuser=0&confirm=t
|
| 7 |
+
# OST weak
|
| 8 |
+
- https://drive.usercontent.google.com/download?id=1z5CTDJucUnvALG12Q4UXk1DDKJDd8WJn&authuser=0&confirm=t
|
| 9 |
+
- https://drive.usercontent.google.com/download?id=1V17TTkX3sjpV7v0km_F2SDCK0tL3k_ls&authuser=0&confirm=t
|
| 10 |
+
filenames:
|
| 11 |
+
# OST heavy
|
| 12 |
+
- ../OST/heavy/data.mdb
|
| 13 |
+
- ../OST/heavy/lock.mdb
|
| 14 |
+
# OST weak
|
| 15 |
+
- ../OST/weak/data.mdb
|
| 16 |
+
- ../OST/weak/lock.mdb
|
| 17 |
+
check_validity: true
|
configs/dataset/rec/synthtext.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
root: ../synth
|
| 2 |
+
task: str
|
| 3 |
+
download_links:
|
| 4 |
+
- https://drive.usercontent.google.com/download?id=1T-enqkq6_l2HqrsV3da_h0oJ7CUKu_oc&authuser=0&confirm=t
|
| 5 |
+
filenames:
|
| 6 |
+
- ../synth/ST.zip
|
| 7 |
+
check_validity: true
|
configs/dataset/rec/test.yaml
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
root: ../test
|
| 2 |
+
task: str
|
| 3 |
+
download_links:
|
| 4 |
+
# IC13_857
|
| 5 |
+
- https://drive.usercontent.google.com/download?id=1PZSCbe6_DI8MlCqCRWXGT2PP92_frIXq&authuser=0&confirm=t
|
| 6 |
+
- https://drive.usercontent.google.com/download?id=1qkN7NDg0zUHxUiZHAeEatDTqlsgpFWp3&authuser=0&confirm=t
|
| 7 |
+
# IC15_2077
|
| 8 |
+
- https://drive.usercontent.google.com/download?id=1dFkY3DNbr-Mepn3TWBiA9COEJ63fGFcp&authuser=0&confirm=t
|
| 9 |
+
- https://drive.usercontent.google.com/download?id=1UvVwLNZ3tS1YdTBa8MulPzjeVezKaDro&authuser=0&confirm=t
|
| 10 |
+
# SVTP
|
| 11 |
+
- https://drive.usercontent.google.com/download?id=1aofeerilxJ7J3S7QxuCEXbmXTpz8Xshx&authuser=0&confirm=t
|
| 12 |
+
- https://drive.usercontent.google.com/download?id=1rJ1KoO4K_VUxEAUN_bMgBGzK8_JZAAno&authuser=0&confirm=t
|
| 13 |
+
# IIIT5k
|
| 14 |
+
- https://drive.usercontent.google.com/download?id=1XFO2M1Kbgwv3-iTNTmhQXAEjNmKYOeoT&authuser=0&confirm=t
|
| 15 |
+
- https://drive.usercontent.google.com/download?id=1stwK2hFsyaV7HHsEG9EYgnUQebNb2_nG&authuser=0&confirm=t
|
| 16 |
+
# COCOv1.4
|
| 17 |
+
- https://drive.usercontent.google.com/download?id=1Se2QSGS19xx7Gfy-SUdX9mlAOr2eYsfA&authuser=0&confirm=t
|
| 18 |
+
- https://drive.usercontent.google.com/download?id=1xvekFi389QfkH7yS0JIVV0QzjhUspjDv&authuser=0&confirm=t
|
| 19 |
+
# IC15_1811
|
| 20 |
+
- https://drive.usercontent.google.com/download?id=1pHsw8wrThD9EGEE6AusQLZozefSj4iyR&authuser=0&confirm=t
|
| 21 |
+
- https://drive.usercontent.google.com/download?id=1TXZ1qHuKAksaAlvd3qMv4IHKnN-IJW9a&authuser=0&confirm=t
|
| 22 |
+
# Uber
|
| 23 |
+
- https://drive.usercontent.google.com/download?id=1L2j6BZeLTGQ1FIl8HB_D3AFiWLltGV5r&authuser=0&confirm=t
|
| 24 |
+
- https://drive.usercontent.google.com/download?id=12DUj28yzLWxFO_gfMfSjTkRujYD5MNEE&authuser=0&confirm=t
|
| 25 |
+
# IC13_1095
|
| 26 |
+
- https://drive.usercontent.google.com/download?id=1fu8onMt3Z6fDLNAiHcm-sQ2qCXduE-FU&authuser=0&confirm=t
|
| 27 |
+
- https://drive.usercontent.google.com/download?id=1OQAZtLj8U2Cl4L0ErGFsz6vGIVTTWasD&authuser=0&confirm=t
|
| 28 |
+
# IC13_1015
|
| 29 |
+
- https://drive.usercontent.google.com/download?id=1mbsfuvWB282HYfn9tbqcj1nUDkLXcSNB&authuser=0&confirm=t
|
| 30 |
+
- https://drive.usercontent.google.com/download?id=1QGogU_hV-oN7iY2POutdD2LDcmK6plnV&authuser=0&confirm=t
|
| 31 |
+
# ArT
|
| 32 |
+
- https://drive.usercontent.google.com/download?id=1-53knSy-uTSngCG7wyBngVyTuTCmdnWl&authuser=0&confirm=t
|
| 33 |
+
- https://drive.usercontent.google.com/download?id=172EsSaf7BVaB1ORtohi-Jc_8SuUKZGGf&authuser=0&confirm=t
|
| 34 |
+
# SVT
|
| 35 |
+
- https://drive.usercontent.google.com/download?id=1p7aVUr9Yr7c4X4YUBvk2-YP28rraHjn9&authuser=0&confirm=t
|
| 36 |
+
- https://drive.usercontent.google.com/download?id=1ALmhvSleZ0yf-lcdbQPP3M9Zc3oqnXij&authuser=0&confirm=t
|
| 37 |
+
# CUTE80
|
| 38 |
+
- https://drive.usercontent.google.com/download?id=1Ujr4axHKnu54P2rIGUhkjdM6XlhDYrI_&authuser=0&confirm=t
|
| 39 |
+
- https://drive.usercontent.google.com/download?id=1DvZi9L3MqjO2zRUyCg3YvP4qMAt2bsme&authuser=0&confirm=t
|
| 40 |
+
filenames:
|
| 41 |
+
# IC13_857
|
| 42 |
+
- ../test/IC13_857/data.mdb
|
| 43 |
+
- ../test/IC13_857/lock.mdb
|
| 44 |
+
# IC15_2077
|
| 45 |
+
- ../test/IC15_2077/data.mdb
|
| 46 |
+
- ../test/IC15_2077/lock.mdb
|
| 47 |
+
# SVTP
|
| 48 |
+
- ../test/SVTP/data.mdb
|
| 49 |
+
- ../test/SVTP/lock.mdb
|
| 50 |
+
# IIIT5k
|
| 51 |
+
- ../test/IIIT5k/data.mdb
|
| 52 |
+
- ../test/IIIT5k/lock.mdb
|
| 53 |
+
# COCOv1.4
|
| 54 |
+
- ../test/COCOv1.4/data.mdb
|
| 55 |
+
- ../test/COCOv1.4/lock.mdb
|
| 56 |
+
# IC15_1811
|
| 57 |
+
- ../test/IC15_1811/data.mdb
|
| 58 |
+
- ../test/IC15_1811/lock.mdb
|
| 59 |
+
# Uber
|
| 60 |
+
- ../test/Uber/data.mdb
|
| 61 |
+
- ../test/Uber/lock.mdb
|
| 62 |
+
# IC13_1095
|
| 63 |
+
- ../test/IC13_1095/data.mdb
|
| 64 |
+
- ../test/IC13_1095/lock.mdb
|
| 65 |
+
# IC13_1015
|
| 66 |
+
- ../test/IC13_1015/data.mdb
|
| 67 |
+
- ../test/IC13_1015/lock.mdb
|
| 68 |
+
# ArT
|
| 69 |
+
- ../test/ArT/data.mdb
|
| 70 |
+
- ../test/ArT/lock.mdb
|
| 71 |
+
# SVT
|
| 72 |
+
- ../test/SVT/data.mdb
|
| 73 |
+
- ../test/SVT/lock.mdb
|
| 74 |
+
# CUTE80
|
| 75 |
+
- ../test/CUTE80/data.mdb
|
| 76 |
+
- ../test/CUTE80/lock.mdb
|
| 77 |
+
check_validity: true
|
configs/dataset/rec/textocr.yaml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
root: ../TextOCR
|
| 2 |
+
task: str
|
| 3 |
+
download_links:
|
| 4 |
+
# train
|
| 5 |
+
- https://drive.usercontent.google.com/download?id=1jVjJFno4pnsU0Cp_kn4MIXQrChmELy92&authuser=0&confirm=
|
| 6 |
+
# val
|
| 7 |
+
- https://drive.usercontent.google.com/download?id=1ubIRu01MXIek6OvInu-XjaIbw6277-vw&authuser=0&confirm=t
|
| 8 |
+
filenames:
|
| 9 |
+
# train
|
| 10 |
+
- ../TextOCR/train.zip
|
| 11 |
+
# val
|
| 12 |
+
- ../TextOCR/val.zip
|
| 13 |
+
check_validity: true
|
configs/dataset/rec/textocr_horizontal.yaml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
root: ../TextOCR_horizontal
|
| 2 |
+
task: str
|
| 3 |
+
download_links:
|
| 4 |
+
# train
|
| 5 |
+
- https://drive.usercontent.google.com/download?id=1sWH6J11xbjQb8SH7fdG_8mIKVI81ZQy5&authuser=0&confirm=
|
| 6 |
+
# val
|
| 7 |
+
- https://drive.usercontent.google.com/download?id=1gIE-AU2o-5hvg288-bjphO6UkI5AEQ2d&authuser=0&confirm=t
|
| 8 |
+
filenames:
|
| 9 |
+
# train
|
| 10 |
+
- ../TextOCR_horizontal/train.zip
|
| 11 |
+
# val
|
| 12 |
+
- ../TextOCR_horizontal/val.zip
|
| 13 |
+
check_validity: true
|
configs/dataset/rec/union14m_b.yaml
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
root: ../u14m
|
| 2 |
+
task: str
|
| 3 |
+
download_links:
|
| 4 |
+
# artistic
|
| 5 |
+
- https://drive.usercontent.google.com/download?id=1Je2DTuFHnkXDI99yDnm9Anl5naWaCQwd&authuser=0&confirm=t
|
| 6 |
+
- https://drive.usercontent.google.com/download?id=1xtT_Q0juBJUIvAG55qBxoVNNTECd2usZ&authuser=0&confirm=t
|
| 7 |
+
# contextless
|
| 8 |
+
- https://drive.usercontent.google.com/download?id=1_0OzyzWhZOmGrHkayFTVrzhrQrNRDRPR&authuser=0&confirm=t
|
| 9 |
+
- https://drive.usercontent.google.com/download?id=1PPgC42y3xoM9bR0HQFbDYbcT3PzMdD_y&authuser=0&confirm=t
|
| 10 |
+
# salient
|
| 11 |
+
- https://drive.usercontent.google.com/download?id=1tHLMYBmTqRnxvFOTT3dfLfQiundqFWfd&authuser=0&confirm=t
|
| 12 |
+
- https://drive.usercontent.google.com/download?id=13NQgpAtCK0kh9M5E2pAUmKKEp6Qu5Xwj&authuser=0&confirm=t
|
| 13 |
+
# multi_words
|
| 14 |
+
- https://drive.usercontent.google.com/download?id=1IlnDKX3V_Vp9gsDGFB0xoqsVLH1vtxUI&authuser=0&confirm=t
|
| 15 |
+
- https://drive.usercontent.google.com/download?id=1mFFjC7C0CwevvkwFU9YeVbZBdps_3Qpb&authuser=0&confirm=t
|
| 16 |
+
# curve
|
| 17 |
+
- https://drive.usercontent.google.com/download?id=1MxhMd85cmhUtI2lmtXhZQuFk7lav0_fw&authuser=0&confirm=t
|
| 18 |
+
- https://drive.usercontent.google.com/download?id=1N03g-4e-kJG2mRvlM0c5TrwWAkd-iG-Q&authuser=0&confirm=t
|
| 19 |
+
# general
|
| 20 |
+
- https://drive.usercontent.google.com/download?id=1Oqt7OaycP466NWoDmoJ3FqS8YP3YRgvu&authuser=0&confirm=t
|
| 21 |
+
- https://drive.usercontent.google.com/download?id=1K0MrX5eYNt8IIGFHXCwg0_oI5OF5PPFO&authuser=0&confirm=t
|
| 22 |
+
# multi_oriented
|
| 23 |
+
- https://drive.usercontent.google.com/download?id=1TKZFcZPVk0ThqfF-AGhJk_OCLg0ykKbv&authuser=0&confirm=t
|
| 24 |
+
- https://drive.usercontent.google.com/download?id=1PAoLMUWuR7O2-7XRoKkNzQcSiznErQzD&authuser=0&confirm=t
|
| 25 |
+
filenames:
|
| 26 |
+
# artistic
|
| 27 |
+
- ../u14m/artistic/data.mdb
|
| 28 |
+
- ../u14m/artistic/lock.mdb
|
| 29 |
+
# contextless
|
| 30 |
+
- ../u14m/contextless/data.mdb
|
| 31 |
+
- ../u14m/contextless/lock.mdb
|
| 32 |
+
# salient
|
| 33 |
+
- ../u14m/salient/data.mdb
|
| 34 |
+
- ../u14m/salient/lock.mdb
|
| 35 |
+
# multi_words
|
| 36 |
+
- ../u14m/multi_words/data.mdb
|
| 37 |
+
- ../u14m/multi_words/lock.mdb
|
| 38 |
+
# curve
|
| 39 |
+
- ../u14m/curve/data.mdb
|
| 40 |
+
- ../u14m/curve/lock.mdb
|
| 41 |
+
# general
|
| 42 |
+
- ../u14m/general/data.mdb
|
| 43 |
+
- ../u14m/general/lock.mdb
|
| 44 |
+
# multi_oriented
|
| 45 |
+
- ../u14m/multi_oriented/data.mdb
|
| 46 |
+
- ../u14m/multi_oriented/lock.mdb
|
| 47 |
+
check_validity: true
|
configs/dataset/rec/union14m_l_filtered.yaml
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
root: ../Union14M-L-LMDB-Filtered
|
| 2 |
+
task: str
|
| 3 |
+
download_links:
|
| 4 |
+
# train_challenging
|
| 5 |
+
- https://drive.usercontent.google.com/download?id=1etwzBgGHjsFsb0sygsaRnKbanW2PMe07&authuser=0&confirm=t
|
| 6 |
+
- https://drive.usercontent.google.com/download?id=1ly6FJfPjItwGlVQ-ifTrzzM3rVu3Ezhr&authuser=0&confirm=t
|
| 7 |
+
# train_easy
|
| 8 |
+
- https://drive.usercontent.google.com/download?id=1_zeNluTnywIaa5h3PN-Ah9tKyByypot7&authuser=0&confirm=t
|
| 9 |
+
- https://drive.usercontent.google.com/download?id=1caYLeQHDidXgVBDi9IWXbO1gg__DYq9a&authuser=0&confirm=t
|
| 10 |
+
# train_hard
|
| 11 |
+
- https://drive.usercontent.google.com/download?id=1eP6s2xyYPZX9gykvWA4VSOc3Fqul_UB_&authuser=0&confirm=t
|
| 12 |
+
- https://drive.usercontent.google.com/download?id=1-ZlCvocX8P5uVRclUXp_5DNGLDzd16EO&authuser=0&confirm=t
|
| 13 |
+
# train_medium
|
| 14 |
+
- https://drive.usercontent.google.com/download?id=1s_CoaLNJEr-UxHYiqZ5jOcliMCFiRUUy&authuser=0&confirm=t
|
| 15 |
+
- https://drive.usercontent.google.com/download?id=1Wpj6WVpZ5Ily77kVwfQ18CiZBzkgmEnF&authuser=0&confirm=t
|
| 16 |
+
# train_normal
|
| 17 |
+
- https://drive.usercontent.google.com/download?id=1jPt44arlAswl9cXZjzmVcdpptdTPpJ3I&authuser=0&confirm=t
|
| 18 |
+
- https://drive.usercontent.google.com/download?id=1Rfc5kE03AzOUv7B_eYcBhUV8KMQ2MZ1m&authuser=0&confirm=t
|
| 19 |
+
filenames:
|
| 20 |
+
# train_challenging
|
| 21 |
+
- ../Union14M-L-LMDB-Filtered/train_challenging/data.mdb
|
| 22 |
+
- ../Union14M-L-LMDB-Filtered/train_challenging/lock.mdb
|
| 23 |
+
# train_easy
|
| 24 |
+
- ../Union14M-L-LMDB-Filtered/train_easy/data.mdb
|
| 25 |
+
- ../Union14M-L-LMDB-Filtered/train_easy/lock.mdb
|
| 26 |
+
# train_hard
|
| 27 |
+
- ../Union14M-L-LMDB-Filtered/train_hard/data.mdb
|
| 28 |
+
- ../Union14M-L-LMDB-Filtered/train_hard/lock.mdb
|
| 29 |
+
# train_medium
|
| 30 |
+
- ../Union14M-L-LMDB-Filtered/train_medium/data.mdb
|
| 31 |
+
- ../Union14M-L-LMDB-Filtered/train_medium/lock.mdb
|
| 32 |
+
# train_normal
|
| 33 |
+
- ../Union14M-L-LMDB-Filtered/train_normal/data.mdb
|
| 34 |
+
- ../Union14M-L-LMDB-Filtered/train_normal/lock.mdb
|
| 35 |
+
check_validity: true
|
configs/det/dbnet/repvit_db.yml
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: &epoch_num 500
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 100
|
| 6 |
+
output_dir: ./output/det_repsvtr_db
|
| 7 |
+
save_epoch_step: [400, 25]
|
| 8 |
+
eval_batch_step:
|
| 9 |
+
- 0
|
| 10 |
+
- 1000
|
| 11 |
+
cal_metric_during_train: false
|
| 12 |
+
checkpoints:
|
| 13 |
+
pretrained_model: openocr_det_repvit_ch.pth
|
| 14 |
+
save_inference_dir: null
|
| 15 |
+
use_tensorboard: false
|
| 16 |
+
infer_img:
|
| 17 |
+
save_res_path: ./checkpoints/det_db/predicts_db.txt
|
| 18 |
+
distributed: true
|
| 19 |
+
model_type: det
|
| 20 |
+
|
| 21 |
+
Architecture:
|
| 22 |
+
algorithm: DB_mobile
|
| 23 |
+
Backbone:
|
| 24 |
+
name: RepSVTR_det
|
| 25 |
+
Neck:
|
| 26 |
+
name: RSEFPN
|
| 27 |
+
out_channels: 96
|
| 28 |
+
shortcut: True
|
| 29 |
+
Head:
|
| 30 |
+
name: DBHead
|
| 31 |
+
k: 50
|
| 32 |
+
|
| 33 |
+
Loss:
|
| 34 |
+
name: DBLoss
|
| 35 |
+
balance_loss: true
|
| 36 |
+
main_loss_type: DiceLoss
|
| 37 |
+
alpha: 5
|
| 38 |
+
beta: 10
|
| 39 |
+
ohem_ratio: 3
|
| 40 |
+
|
| 41 |
+
Optimizer:
|
| 42 |
+
name: Adam
|
| 43 |
+
lr: 0.001
|
| 44 |
+
weight_decay: 5.0e-05
|
| 45 |
+
filter_bias_and_bn: False
|
| 46 |
+
|
| 47 |
+
LRScheduler:
|
| 48 |
+
name: CosineAnnealingLR
|
| 49 |
+
warmup_epoch: 2
|
| 50 |
+
|
| 51 |
+
PostProcess:
|
| 52 |
+
name: DBPostProcess
|
| 53 |
+
thresh: 0.3
|
| 54 |
+
box_thresh: 0.6
|
| 55 |
+
max_candidates: 1000
|
| 56 |
+
unclip_ratio: 1.5
|
| 57 |
+
score_mode: 'slow'
|
| 58 |
+
|
| 59 |
+
Metric:
|
| 60 |
+
name: DetMetric
|
| 61 |
+
main_indicator: hmean
|
| 62 |
+
|
| 63 |
+
Train:
|
| 64 |
+
dataset:
|
| 65 |
+
name: SimpleDataSet
|
| 66 |
+
data_dir: ../icdar2015/text_localization/
|
| 67 |
+
label_file_list:
|
| 68 |
+
- ../icdar2015/text_localization/train_icdar2015_label.txt
|
| 69 |
+
ratio_list: [1.0]
|
| 70 |
+
transforms:
|
| 71 |
+
- DecodeImage:
|
| 72 |
+
img_mode: BGR
|
| 73 |
+
channel_first: false
|
| 74 |
+
- DetLabelEncode: null
|
| 75 |
+
- CopyPaste: null
|
| 76 |
+
- IaaAugment:
|
| 77 |
+
augmenter_args:
|
| 78 |
+
- type: Fliplr
|
| 79 |
+
args:
|
| 80 |
+
p: 0.5
|
| 81 |
+
- type: Affine
|
| 82 |
+
args:
|
| 83 |
+
rotate:
|
| 84 |
+
- -10
|
| 85 |
+
- 10
|
| 86 |
+
- type: Resize
|
| 87 |
+
args:
|
| 88 |
+
size:
|
| 89 |
+
- 0.5
|
| 90 |
+
- 3
|
| 91 |
+
- EastRandomCropData:
|
| 92 |
+
size:
|
| 93 |
+
- 640
|
| 94 |
+
- 640
|
| 95 |
+
max_tries: 50
|
| 96 |
+
keep_ratio: true
|
| 97 |
+
- MakeBorderMap:
|
| 98 |
+
shrink_ratio: 0.4
|
| 99 |
+
thresh_min: 0.3
|
| 100 |
+
thresh_max: 0.7
|
| 101 |
+
total_epoch: *epoch_num
|
| 102 |
+
- MakeShrinkMap:
|
| 103 |
+
shrink_ratio: 0.4
|
| 104 |
+
min_text_size: 8
|
| 105 |
+
total_epoch: *epoch_num
|
| 106 |
+
- NormalizeImage:
|
| 107 |
+
scale: 1./255.
|
| 108 |
+
mean:
|
| 109 |
+
- 0.485
|
| 110 |
+
- 0.456
|
| 111 |
+
- 0.406
|
| 112 |
+
std:
|
| 113 |
+
- 0.229
|
| 114 |
+
- 0.224
|
| 115 |
+
- 0.225
|
| 116 |
+
order: hwc
|
| 117 |
+
- ToCHWImage: null
|
| 118 |
+
- KeepKeys:
|
| 119 |
+
keep_keys:
|
| 120 |
+
- image
|
| 121 |
+
- threshold_map
|
| 122 |
+
- threshold_mask
|
| 123 |
+
- shrink_map
|
| 124 |
+
- shrink_mask
|
| 125 |
+
loader:
|
| 126 |
+
shuffle: true
|
| 127 |
+
drop_last: false
|
| 128 |
+
batch_size_per_card: 8
|
| 129 |
+
num_workers: 8
|
| 130 |
+
|
| 131 |
+
Eval:
|
| 132 |
+
dataset:
|
| 133 |
+
name: SimpleDataSet
|
| 134 |
+
data_dir: ../icdar2015/text_localization/
|
| 135 |
+
label_file_list:
|
| 136 |
+
- ../icdar2015/text_localization/test_icdar2015_label.txt
|
| 137 |
+
transforms:
|
| 138 |
+
- DecodeImage:
|
| 139 |
+
img_mode: BGR
|
| 140 |
+
channel_first: false
|
| 141 |
+
- DetLabelEncode: null
|
| 142 |
+
- DetResizeForTest:
|
| 143 |
+
# image_shape: [1280, 1280]
|
| 144 |
+
# keep_ratio: True
|
| 145 |
+
# padding: True
|
| 146 |
+
limit_side_len: 960
|
| 147 |
+
limit_type: max
|
| 148 |
+
- NormalizeImage:
|
| 149 |
+
scale: 1./255.
|
| 150 |
+
mean:
|
| 151 |
+
- 0.485
|
| 152 |
+
- 0.456
|
| 153 |
+
- 0.406
|
| 154 |
+
std:
|
| 155 |
+
- 0.229
|
| 156 |
+
- 0.224
|
| 157 |
+
- 0.225
|
| 158 |
+
order: hwc
|
| 159 |
+
- ToCHWImage: null
|
| 160 |
+
- KeepKeys:
|
| 161 |
+
keep_keys:
|
| 162 |
+
- image
|
| 163 |
+
- shape
|
| 164 |
+
- polys
|
| 165 |
+
- ignore_tags
|
| 166 |
+
loader:
|
| 167 |
+
shuffle: false
|
| 168 |
+
drop_last: false
|
| 169 |
+
batch_size_per_card: 1
|
| 170 |
+
num_workers: 2
|
| 171 |
+
profiler_options: null
|
configs/rec/abinet/resnet45_trans_abinet_lang.yml
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/resnet45_trans_abinet_lang/
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
# ./openocr_nolang_abinet_lang.pth
|
| 12 |
+
checkpoints:
|
| 13 |
+
use_tensorboard: false
|
| 14 |
+
infer_img:
|
| 15 |
+
# for data or label process
|
| 16 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 17 |
+
max_text_length: 25
|
| 18 |
+
use_space_char: False
|
| 19 |
+
save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_abinet_lang.txt
|
| 20 |
+
grad_clip_val: 20
|
| 21 |
+
use_amp: True
|
| 22 |
+
|
| 23 |
+
Optimizer:
|
| 24 |
+
name: Adam
|
| 25 |
+
lr: 0.000267
|
| 26 |
+
weight_decay: 0.0
|
| 27 |
+
filter_bias_and_bn: False
|
| 28 |
+
|
| 29 |
+
LRScheduler:
|
| 30 |
+
name: MultiStepLR
|
| 31 |
+
milestones: [12]
|
| 32 |
+
gamma: 0.1
|
| 33 |
+
|
| 34 |
+
Architecture:
|
| 35 |
+
model_type: rec
|
| 36 |
+
algorithm: ABINet
|
| 37 |
+
Transform:
|
| 38 |
+
Encoder:
|
| 39 |
+
name: ResNet45
|
| 40 |
+
in_channels: 3
|
| 41 |
+
strides: [2, 1, 2, 1, 1]
|
| 42 |
+
Decoder:
|
| 43 |
+
name: ABINetDecoder
|
| 44 |
+
iter_size: 3
|
| 45 |
+
|
| 46 |
+
Loss:
|
| 47 |
+
name: ABINetLoss
|
| 48 |
+
|
| 49 |
+
PostProcess:
|
| 50 |
+
name: ABINetLabelDecode
|
| 51 |
+
|
| 52 |
+
Metric:
|
| 53 |
+
name: RecMetric
|
| 54 |
+
main_indicator: acc
|
| 55 |
+
is_filter: True
|
| 56 |
+
|
| 57 |
+
Train:
|
| 58 |
+
dataset:
|
| 59 |
+
name: LMDBDataSet
|
| 60 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 61 |
+
transforms:
|
| 62 |
+
- DecodeImagePIL: # load image
|
| 63 |
+
img_mode: RGB
|
| 64 |
+
- PARSeqAugPIL:
|
| 65 |
+
- ABINetLabelEncode:
|
| 66 |
+
- RecTVResize:
|
| 67 |
+
image_shape: [32, 128]
|
| 68 |
+
padding: False
|
| 69 |
+
- KeepKeys:
|
| 70 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 71 |
+
loader:
|
| 72 |
+
shuffle: True
|
| 73 |
+
batch_size_per_card: 256
|
| 74 |
+
drop_last: True
|
| 75 |
+
num_workers: 4
|
| 76 |
+
|
| 77 |
+
Eval:
|
| 78 |
+
dataset:
|
| 79 |
+
name: LMDBDataSet
|
| 80 |
+
data_dir: ../evaluation
|
| 81 |
+
transforms:
|
| 82 |
+
- DecodeImagePIL: # load image
|
| 83 |
+
img_mode: RGB
|
| 84 |
+
- ABINetLabelEncode:
|
| 85 |
+
- RecTVResize:
|
| 86 |
+
image_shape: [32, 128]
|
| 87 |
+
padding: False
|
| 88 |
+
- KeepKeys:
|
| 89 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 90 |
+
loader:
|
| 91 |
+
shuffle: False
|
| 92 |
+
drop_last: False
|
| 93 |
+
batch_size_per_card: 256
|
| 94 |
+
num_workers: 2
|
configs/rec/abinet/resnet45_trans_abinet_wo_lang.yml
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/resnet45_trans_abinet_wo_lang/
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: 25
|
| 17 |
+
use_space_char: False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_abinet_wo_lang.txt
|
| 19 |
+
grad_clip_val: 20
|
| 20 |
+
use_amp: True
|
| 21 |
+
|
| 22 |
+
Optimizer:
|
| 23 |
+
name: Adam
|
| 24 |
+
lr: 0.000267
|
| 25 |
+
weight_decay: 0.0
|
| 26 |
+
filter_bias_and_bn: False
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: MultiStepLR
|
| 30 |
+
milestones: [12]
|
| 31 |
+
gamma: 0.1
|
| 32 |
+
|
| 33 |
+
Architecture:
|
| 34 |
+
model_type: rec
|
| 35 |
+
algorithm: ABINet
|
| 36 |
+
Transform:
|
| 37 |
+
Encoder:
|
| 38 |
+
name: ResNet45
|
| 39 |
+
in_channels: 3
|
| 40 |
+
strides: [2, 1, 2, 1, 1]
|
| 41 |
+
Decoder:
|
| 42 |
+
name: ABINetDecoder
|
| 43 |
+
iter_size: 0
|
| 44 |
+
|
| 45 |
+
Loss:
|
| 46 |
+
name: ABINetLoss
|
| 47 |
+
|
| 48 |
+
PostProcess:
|
| 49 |
+
name: ABINetLabelDecode
|
| 50 |
+
|
| 51 |
+
Metric:
|
| 52 |
+
name: RecMetric
|
| 53 |
+
main_indicator: acc
|
| 54 |
+
is_filter: True
|
| 55 |
+
|
| 56 |
+
Train:
|
| 57 |
+
dataset:
|
| 58 |
+
name: LMDBDataSet
|
| 59 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 60 |
+
transforms:
|
| 61 |
+
- DecodeImagePIL: # load image
|
| 62 |
+
img_mode: RGB
|
| 63 |
+
- PARSeqAugPIL:
|
| 64 |
+
- ABINetLabelEncode:
|
| 65 |
+
- RecTVResize:
|
| 66 |
+
image_shape: [32, 128]
|
| 67 |
+
padding: False
|
| 68 |
+
- KeepKeys:
|
| 69 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 70 |
+
loader:
|
| 71 |
+
shuffle: True
|
| 72 |
+
batch_size_per_card: 256
|
| 73 |
+
drop_last: True
|
| 74 |
+
num_workers: 4
|
| 75 |
+
|
| 76 |
+
Eval:
|
| 77 |
+
dataset:
|
| 78 |
+
name: LMDBDataSet
|
| 79 |
+
data_dir: ../evaluation
|
| 80 |
+
transforms:
|
| 81 |
+
- DecodeImagePIL: # load image
|
| 82 |
+
img_mode: RGB
|
| 83 |
+
- ABINetLabelEncode:
|
| 84 |
+
- RecTVResize:
|
| 85 |
+
image_shape: [32, 128]
|
| 86 |
+
padding: False
|
| 87 |
+
- KeepKeys:
|
| 88 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 89 |
+
loader:
|
| 90 |
+
shuffle: False
|
| 91 |
+
drop_last: False
|
| 92 |
+
batch_size_per_card: 256
|
| 93 |
+
num_workers: 2
|
configs/rec/abinet/svtrv2_abinet_lang.yml
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtrv2_abinet_lang/
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
# ./openocr_svtrv2_nolang_abinet_lang.pth
|
| 12 |
+
checkpoints:
|
| 13 |
+
use_tensorboard: false
|
| 14 |
+
infer_img:
|
| 15 |
+
# for data or label process
|
| 16 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 17 |
+
max_text_length: 25
|
| 18 |
+
use_space_char: False
|
| 19 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_abinet_lang.txt
|
| 20 |
+
use_amp: True
|
| 21 |
+
grad_clip_val: 20
|
| 22 |
+
|
| 23 |
+
Optimizer:
|
| 24 |
+
name: AdamW
|
| 25 |
+
lr: 0.00065 # for 4gpus bs256/gpu
|
| 26 |
+
weight_decay: 0.05
|
| 27 |
+
filter_bias_and_bn: True
|
| 28 |
+
|
| 29 |
+
LRScheduler:
|
| 30 |
+
name: OneCycleLR
|
| 31 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 32 |
+
cycle_momentum: False
|
| 33 |
+
|
| 34 |
+
Architecture:
|
| 35 |
+
model_type: rec
|
| 36 |
+
algorithm: ABINet
|
| 37 |
+
Transform:
|
| 38 |
+
Encoder:
|
| 39 |
+
name: SVTRv2LNConvTwo33
|
| 40 |
+
use_pos_embed: False
|
| 41 |
+
dims: [128, 256, 384]
|
| 42 |
+
depths: [6, 6, 6]
|
| 43 |
+
num_heads: [4, 8, 12]
|
| 44 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 45 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 46 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 47 |
+
last_stage: false
|
| 48 |
+
feat2d: True
|
| 49 |
+
Decoder:
|
| 50 |
+
name: ABINetDecoder
|
| 51 |
+
iter_size: 3
|
| 52 |
+
num_layers: 0
|
| 53 |
+
|
| 54 |
+
Loss:
|
| 55 |
+
name: ABINetLoss
|
| 56 |
+
|
| 57 |
+
PostProcess:
|
| 58 |
+
name: ABINetLabelDecode
|
| 59 |
+
|
| 60 |
+
Metric:
|
| 61 |
+
name: RecMetric
|
| 62 |
+
main_indicator: acc
|
| 63 |
+
is_filter: True
|
| 64 |
+
|
| 65 |
+
Train:
|
| 66 |
+
dataset:
|
| 67 |
+
name: RatioDataSetTVResize
|
| 68 |
+
ds_width: True
|
| 69 |
+
padding: false
|
| 70 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
|
| 71 |
+
'../Union14M-L-LMDB-Filtered/filter_train_hard',
|
| 72 |
+
'../Union14M-L-LMDB-Filtered/filter_train_medium',
|
| 73 |
+
'../Union14M-L-LMDB-Filtered/filter_train_normal',
|
| 74 |
+
'../Union14M-L-LMDB-Filtered/filter_train_easy',
|
| 75 |
+
]
|
| 76 |
+
transforms:
|
| 77 |
+
- DecodeImagePIL: # load image
|
| 78 |
+
img_mode: RGB
|
| 79 |
+
- PARSeqAugPIL:
|
| 80 |
+
- ABINetLabelEncode:
|
| 81 |
+
- KeepKeys:
|
| 82 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 83 |
+
sampler:
|
| 84 |
+
name: RatioSampler
|
| 85 |
+
scales: [[128, 32]] # w, h
|
| 86 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 87 |
+
first_bs: &bs 256
|
| 88 |
+
fix_bs: false
|
| 89 |
+
divided_factor: [4, 16] # w, h
|
| 90 |
+
is_training: True
|
| 91 |
+
loader:
|
| 92 |
+
shuffle: True
|
| 93 |
+
batch_size_per_card: *bs
|
| 94 |
+
drop_last: True
|
| 95 |
+
max_ratio: &max_ratio 4
|
| 96 |
+
num_workers: 4
|
| 97 |
+
|
| 98 |
+
Eval:
|
| 99 |
+
dataset:
|
| 100 |
+
name: RatioDataSetTVResize
|
| 101 |
+
ds_width: True
|
| 102 |
+
padding: False
|
| 103 |
+
data_dir_list: [
|
| 104 |
+
'../evaluation/CUTE80',
|
| 105 |
+
'../evaluation/IC13_857',
|
| 106 |
+
'../evaluation/IC15_1811',
|
| 107 |
+
'../evaluation/IIIT5k',
|
| 108 |
+
'../evaluation/SVT',
|
| 109 |
+
'../evaluation/SVTP',
|
| 110 |
+
]
|
| 111 |
+
transforms:
|
| 112 |
+
- DecodeImagePIL: # load image
|
| 113 |
+
img_mode: RGB
|
| 114 |
+
- ABINetLabelEncode:
|
| 115 |
+
- KeepKeys:
|
| 116 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 117 |
+
sampler:
|
| 118 |
+
name: RatioSampler
|
| 119 |
+
scales: [[128, 32]] # w, h
|
| 120 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 121 |
+
first_bs: *bs
|
| 122 |
+
fix_bs: false
|
| 123 |
+
divided_factor: [4, 16] # w, h
|
| 124 |
+
is_training: False
|
| 125 |
+
loader:
|
| 126 |
+
shuffle: False
|
| 127 |
+
drop_last: False
|
| 128 |
+
batch_size_per_card: *bs
|
| 129 |
+
max_ratio: *max_ratio
|
| 130 |
+
num_workers: 4
|
configs/rec/abinet/svtrv2_abinet_wo_lang.yml
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtrv2_abinet_wo_lang/
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: 25
|
| 17 |
+
use_space_char: False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_abinet_wo_lang.txt
|
| 19 |
+
use_amp: True
|
| 20 |
+
grad_clip_val: 20
|
| 21 |
+
|
| 22 |
+
Optimizer:
|
| 23 |
+
name: AdamW
|
| 24 |
+
lr: 0.00065 # for 4gpus bs256/gpu
|
| 25 |
+
weight_decay: 0.05
|
| 26 |
+
filter_bias_and_bn: True
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: OneCycleLR
|
| 30 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 31 |
+
cycle_momentum: False
|
| 32 |
+
|
| 33 |
+
Architecture:
|
| 34 |
+
model_type: rec
|
| 35 |
+
algorithm: ABINet
|
| 36 |
+
Transform:
|
| 37 |
+
Encoder:
|
| 38 |
+
name: SVTRv2LNConvTwo33
|
| 39 |
+
use_pos_embed: False
|
| 40 |
+
dims: [128, 256, 384]
|
| 41 |
+
depths: [6, 6, 6]
|
| 42 |
+
num_heads: [4, 8, 12]
|
| 43 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 44 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 45 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 46 |
+
last_stage: false
|
| 47 |
+
feat2d: True
|
| 48 |
+
Decoder:
|
| 49 |
+
name: ABINetDecoder
|
| 50 |
+
iter_size: 0
|
| 51 |
+
num_layers: 0
|
| 52 |
+
Loss:
|
| 53 |
+
name: ABINetLoss
|
| 54 |
+
|
| 55 |
+
PostProcess:
|
| 56 |
+
name: ABINetLabelDecode
|
| 57 |
+
|
| 58 |
+
Metric:
|
| 59 |
+
name: RecMetric
|
| 60 |
+
main_indicator: acc
|
| 61 |
+
is_filter: True
|
| 62 |
+
|
| 63 |
+
Train:
|
| 64 |
+
dataset:
|
| 65 |
+
name: RatioDataSetTVResize
|
| 66 |
+
ds_width: True
|
| 67 |
+
padding: false
|
| 68 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
|
| 69 |
+
'../Union14M-L-LMDB-Filtered/filter_train_hard',
|
| 70 |
+
'../Union14M-L-LMDB-Filtered/filter_train_medium',
|
| 71 |
+
'../Union14M-L-LMDB-Filtered/filter_train_normal',
|
| 72 |
+
'../Union14M-L-LMDB-Filtered/filter_train_easy',
|
| 73 |
+
]
|
| 74 |
+
transforms:
|
| 75 |
+
- DecodeImagePIL: # load image
|
| 76 |
+
img_mode: RGB
|
| 77 |
+
- PARSeqAugPIL:
|
| 78 |
+
- ABINetLabelEncode:
|
| 79 |
+
- KeepKeys:
|
| 80 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 81 |
+
sampler:
|
| 82 |
+
name: RatioSampler
|
| 83 |
+
scales: [[128, 32]] # w, h
|
| 84 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 85 |
+
first_bs: &bs 256
|
| 86 |
+
fix_bs: false
|
| 87 |
+
divided_factor: [4, 16] # w, h
|
| 88 |
+
is_training: True
|
| 89 |
+
loader:
|
| 90 |
+
shuffle: True
|
| 91 |
+
batch_size_per_card: *bs
|
| 92 |
+
drop_last: True
|
| 93 |
+
max_ratio: &max_ratio 4
|
| 94 |
+
num_workers: 4
|
| 95 |
+
|
| 96 |
+
Eval:
|
| 97 |
+
dataset:
|
| 98 |
+
name: RatioDataSetTVResize
|
| 99 |
+
ds_width: True
|
| 100 |
+
padding: False
|
| 101 |
+
data_dir_list: [
|
| 102 |
+
'../evaluation/CUTE80',
|
| 103 |
+
'../evaluation/IC13_857',
|
| 104 |
+
'../evaluation/IC15_1811',
|
| 105 |
+
'../evaluation/IIIT5k',
|
| 106 |
+
'../evaluation/SVT',
|
| 107 |
+
'../evaluation/SVTP',
|
| 108 |
+
]
|
| 109 |
+
transforms:
|
| 110 |
+
- DecodeImagePIL: # load image
|
| 111 |
+
img_mode: RGB
|
| 112 |
+
- ABINetLabelEncode:
|
| 113 |
+
- KeepKeys:
|
| 114 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 115 |
+
sampler:
|
| 116 |
+
name: RatioSampler
|
| 117 |
+
scales: [[128, 32]] # w, h
|
| 118 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 119 |
+
first_bs: *bs
|
| 120 |
+
fix_bs: false
|
| 121 |
+
divided_factor: [4, 16] # w, h
|
| 122 |
+
is_training: False
|
| 123 |
+
loader:
|
| 124 |
+
shuffle: False
|
| 125 |
+
drop_last: False
|
| 126 |
+
batch_size_per_card: *bs
|
| 127 |
+
max_ratio: *max_ratio
|
| 128 |
+
num_workers: 4
|
configs/rec/aster/resnet31_lstm_aster_tps_on.yml
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/resnet31_lstm_aster_tps_on
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: 25
|
| 17 |
+
use_space_char: False
|
| 18 |
+
save_res_path: ./output/rec/predicts_aster_tps.txt
|
| 19 |
+
use_amp: True
|
| 20 |
+
grad_clip_val: 1.0
|
| 21 |
+
|
| 22 |
+
Optimizer:
|
| 23 |
+
name: Adam
|
| 24 |
+
lr: 0.002 # for 1gpus bs1024/gpu
|
| 25 |
+
weight_decay: 0.0
|
| 26 |
+
filter_bias_and_bn: False
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: OneCycleLR
|
| 30 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 31 |
+
cycle_momentum: False
|
| 32 |
+
|
| 33 |
+
Architecture:
|
| 34 |
+
model_type: rec
|
| 35 |
+
algorithm: aster
|
| 36 |
+
Transform:
|
| 37 |
+
name: Aster_TPS
|
| 38 |
+
tps_inputsize: [32, 64]
|
| 39 |
+
tps_outputsize: [32, 128]
|
| 40 |
+
Encoder:
|
| 41 |
+
name: ResNet_ASTER
|
| 42 |
+
Decoder:
|
| 43 |
+
name: ASTERDecoder
|
| 44 |
+
|
| 45 |
+
Loss:
|
| 46 |
+
name: ARLoss
|
| 47 |
+
|
| 48 |
+
Metric:
|
| 49 |
+
name: RecMetric
|
| 50 |
+
main_indicator: acc
|
| 51 |
+
is_filter: True
|
| 52 |
+
|
| 53 |
+
PostProcess:
|
| 54 |
+
name: ARLabelDecode
|
| 55 |
+
|
| 56 |
+
Train:
|
| 57 |
+
dataset:
|
| 58 |
+
name: LMDBDataSet
|
| 59 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 60 |
+
transforms:
|
| 61 |
+
- DecodeImagePIL: # load image
|
| 62 |
+
img_mode: RGB
|
| 63 |
+
- PARSeqAugPIL:
|
| 64 |
+
- ARLabelEncode: # Class handling label
|
| 65 |
+
- RecTVResize:
|
| 66 |
+
image_shape: [64, 256]
|
| 67 |
+
padding: False
|
| 68 |
+
- KeepKeys:
|
| 69 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 70 |
+
loader:
|
| 71 |
+
shuffle: True
|
| 72 |
+
batch_size_per_card: 1024
|
| 73 |
+
drop_last: True
|
| 74 |
+
num_workers: 4
|
| 75 |
+
|
| 76 |
+
Eval:
|
| 77 |
+
dataset:
|
| 78 |
+
name: LMDBDataSet
|
| 79 |
+
data_dir: ../evaluation
|
| 80 |
+
transforms:
|
| 81 |
+
- DecodeImagePIL: # load image
|
| 82 |
+
img_mode: RGB
|
| 83 |
+
- ARLabelEncode: # Class handling label
|
| 84 |
+
- RecTVResize:
|
| 85 |
+
image_shape: [64, 256]
|
| 86 |
+
padding: False
|
| 87 |
+
- KeepKeys:
|
| 88 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 89 |
+
loader:
|
| 90 |
+
shuffle: False
|
| 91 |
+
drop_last: False
|
| 92 |
+
batch_size_per_card: 256
|
| 93 |
+
num_workers: 2
|
configs/rec/aster/svtrv2_aster.yml
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtrv2_aster
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: 25
|
| 17 |
+
use_space_char: False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_aster.txt
|
| 19 |
+
use_amp: True
|
| 20 |
+
|
| 21 |
+
Optimizer:
|
| 22 |
+
name: AdamW
|
| 23 |
+
lr: 0.00065 # for 4gpus bs256/gpu
|
| 24 |
+
weight_decay: 0.05
|
| 25 |
+
filter_bias_and_bn: True
|
| 26 |
+
|
| 27 |
+
LRScheduler:
|
| 28 |
+
name: OneCycleLR
|
| 29 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 30 |
+
cycle_momentum: False
|
| 31 |
+
|
| 32 |
+
Architecture:
|
| 33 |
+
model_type: rec
|
| 34 |
+
algorithm: aster
|
| 35 |
+
Transform:
|
| 36 |
+
Encoder:
|
| 37 |
+
name: SVTRv2LNConvTwo33
|
| 38 |
+
use_pos_embed: False
|
| 39 |
+
out_channels: 256
|
| 40 |
+
dims: [128, 256, 384]
|
| 41 |
+
depths: [6, 6, 6]
|
| 42 |
+
num_heads: [4, 8, 12]
|
| 43 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 44 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 45 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 46 |
+
last_stage: false
|
| 47 |
+
feat2d: False
|
| 48 |
+
Decoder:
|
| 49 |
+
name: ASTERDecoder
|
| 50 |
+
|
| 51 |
+
Loss:
|
| 52 |
+
name: ARLoss
|
| 53 |
+
|
| 54 |
+
Metric:
|
| 55 |
+
name: RecMetric
|
| 56 |
+
main_indicator: acc
|
| 57 |
+
is_filter: True
|
| 58 |
+
|
| 59 |
+
PostProcess:
|
| 60 |
+
name: ARLabelDecode
|
| 61 |
+
|
| 62 |
+
Train:
|
| 63 |
+
dataset:
|
| 64 |
+
name: RatioDataSetTVResize
|
| 65 |
+
ds_width: True
|
| 66 |
+
padding: false
|
| 67 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
|
| 68 |
+
'../Union14M-L-LMDB-Filtered/filter_train_hard',
|
| 69 |
+
'../Union14M-L-LMDB-Filtered/filter_train_medium',
|
| 70 |
+
'../Union14M-L-LMDB-Filtered/filter_train_normal',
|
| 71 |
+
'../Union14M-L-LMDB-Filtered/filter_train_easy',
|
| 72 |
+
]
|
| 73 |
+
transforms:
|
| 74 |
+
- DecodeImagePIL: # load image
|
| 75 |
+
img_mode: RGB
|
| 76 |
+
- PARSeqAugPIL:
|
| 77 |
+
- ARLabelEncode: # Class handling label
|
| 78 |
+
- KeepKeys:
|
| 79 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 80 |
+
sampler:
|
| 81 |
+
name: RatioSampler
|
| 82 |
+
scales: [[128, 32]] # w, h
|
| 83 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 84 |
+
first_bs: &bs 256
|
| 85 |
+
fix_bs: false
|
| 86 |
+
divided_factor: [4, 16] # w, h
|
| 87 |
+
is_training: True
|
| 88 |
+
loader:
|
| 89 |
+
shuffle: True
|
| 90 |
+
batch_size_per_card: *bs
|
| 91 |
+
drop_last: True
|
| 92 |
+
max_ratio: &max_ratio 4
|
| 93 |
+
num_workers: 4
|
| 94 |
+
|
| 95 |
+
Eval:
|
| 96 |
+
dataset:
|
| 97 |
+
name: RatioDataSetTVResize
|
| 98 |
+
ds_width: True
|
| 99 |
+
padding: False
|
| 100 |
+
data_dir_list: [
|
| 101 |
+
'../evaluation/CUTE80',
|
| 102 |
+
'../evaluation/IC13_857',
|
| 103 |
+
'../evaluation/IC15_1811',
|
| 104 |
+
'../evaluation/IIIT5k',
|
| 105 |
+
'../evaluation/SVT',
|
| 106 |
+
'../evaluation/SVTP',
|
| 107 |
+
]
|
| 108 |
+
transforms:
|
| 109 |
+
- DecodeImagePIL: # load image
|
| 110 |
+
img_mode: RGB
|
| 111 |
+
- ARLabelEncode: # Class handling label
|
| 112 |
+
- KeepKeys:
|
| 113 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 114 |
+
sampler:
|
| 115 |
+
name: RatioSampler
|
| 116 |
+
scales: [[128, 32]] # w, h
|
| 117 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 118 |
+
first_bs: *bs
|
| 119 |
+
fix_bs: false
|
| 120 |
+
divided_factor: [4, 16] # w, h
|
| 121 |
+
is_training: False
|
| 122 |
+
loader:
|
| 123 |
+
shuffle: False
|
| 124 |
+
drop_last: False
|
| 125 |
+
batch_size_per_card: *bs
|
| 126 |
+
max_ratio: *max_ratio
|
| 127 |
+
num_workers: 4
|
configs/rec/aster/svtrv2_aster_tps_on.yml
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtrv2_aster_tps_on
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: 25
|
| 17 |
+
use_space_char: False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_aster_tps_on.txt
|
| 19 |
+
use_amp: True
|
| 20 |
+
|
| 21 |
+
Optimizer:
|
| 22 |
+
name: AdamW
|
| 23 |
+
lr: 0.00065 # for 4gpus bs256/gpu
|
| 24 |
+
weight_decay: 0.05
|
| 25 |
+
filter_bias_and_bn: True
|
| 26 |
+
|
| 27 |
+
LRScheduler:
|
| 28 |
+
name: OneCycleLR
|
| 29 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 30 |
+
cycle_momentum: False
|
| 31 |
+
|
| 32 |
+
Architecture:
|
| 33 |
+
model_type: rec
|
| 34 |
+
algorithm: aster
|
| 35 |
+
Transform:
|
| 36 |
+
name: Aster_TPS
|
| 37 |
+
tps_inputsize: [32, 64]
|
| 38 |
+
tps_outputsize: [32, 128]
|
| 39 |
+
Encoder:
|
| 40 |
+
name: SVTRv2LNConvTwo33
|
| 41 |
+
use_pos_embed: False
|
| 42 |
+
out_channels: 256
|
| 43 |
+
dims: [128, 256, 384]
|
| 44 |
+
depths: [6, 6, 6]
|
| 45 |
+
num_heads: [4, 8, 12]
|
| 46 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 47 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 48 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 49 |
+
last_stage: false
|
| 50 |
+
feat2d: False
|
| 51 |
+
Decoder:
|
| 52 |
+
name: ASTERDecoder
|
| 53 |
+
|
| 54 |
+
Loss:
|
| 55 |
+
name: ARLoss
|
| 56 |
+
|
| 57 |
+
Metric:
|
| 58 |
+
name: RecMetric
|
| 59 |
+
main_indicator: acc
|
| 60 |
+
is_filter: True
|
| 61 |
+
|
| 62 |
+
PostProcess:
|
| 63 |
+
name: ARLabelDecode
|
| 64 |
+
|
| 65 |
+
Train:
|
| 66 |
+
dataset:
|
| 67 |
+
name: LMDBDataSet
|
| 68 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 69 |
+
transforms:
|
| 70 |
+
- DecodeImagePIL: # load image
|
| 71 |
+
img_mode: RGB
|
| 72 |
+
- PARSeqAugPIL:
|
| 73 |
+
- ARLabelEncode: # Class handling label
|
| 74 |
+
- RecTVResize:
|
| 75 |
+
image_shape: [64, 256]
|
| 76 |
+
padding: False
|
| 77 |
+
- KeepKeys:
|
| 78 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 79 |
+
loader:
|
| 80 |
+
shuffle: True
|
| 81 |
+
batch_size_per_card: 256
|
| 82 |
+
drop_last: True
|
| 83 |
+
num_workers: 4
|
| 84 |
+
|
| 85 |
+
Eval:
|
| 86 |
+
dataset:
|
| 87 |
+
name: LMDBDataSet
|
| 88 |
+
data_dir: ../evaluation
|
| 89 |
+
transforms:
|
| 90 |
+
- DecodeImagePIL: # load image
|
| 91 |
+
img_mode: RGB
|
| 92 |
+
- ARLabelEncode: # Class handling label
|
| 93 |
+
- RecTVResize:
|
| 94 |
+
image_shape: [64, 256]
|
| 95 |
+
padding: False
|
| 96 |
+
- KeepKeys:
|
| 97 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 98 |
+
loader:
|
| 99 |
+
shuffle: False
|
| 100 |
+
drop_last: False
|
| 101 |
+
batch_size_per_card: 256
|
| 102 |
+
num_workers: 2
|
configs/rec/autostr/autostr_lstm_aster_tps_on.yml
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/autostr_lstm_aster_tps_on
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: 25
|
| 17 |
+
use_space_char: False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_autostr_lstm_aster_tps_on.txt
|
| 19 |
+
use_amp: True
|
| 20 |
+
grad_clip_val: 1.0
|
| 21 |
+
|
| 22 |
+
Optimizer:
|
| 23 |
+
name: Adam
|
| 24 |
+
lr: 0.002 # for 4gpus bs256/gpu
|
| 25 |
+
weight_decay: 0.0
|
| 26 |
+
filter_bias_and_bn: False
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: OneCycleLR
|
| 30 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 31 |
+
cycle_momentum: False
|
| 32 |
+
|
| 33 |
+
Architecture:
|
| 34 |
+
model_type: rec
|
| 35 |
+
algorithm: autostr
|
| 36 |
+
Transform:
|
| 37 |
+
name: Aster_TPS
|
| 38 |
+
tps_inputsize: [32, 64]
|
| 39 |
+
tps_outputsize: [32, 128]
|
| 40 |
+
Encoder:
|
| 41 |
+
name: AutoSTREncoder
|
| 42 |
+
stride_stages: '[(2, 2), (2, 1), (2, 2), (2, 1), (2, 1)]'
|
| 43 |
+
conv_op_ids: [2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 4, 1, 1, 6, 6]
|
| 44 |
+
Decoder:
|
| 45 |
+
name: ASTERDecoder
|
| 46 |
+
|
| 47 |
+
Loss:
|
| 48 |
+
name: ARLoss
|
| 49 |
+
|
| 50 |
+
Metric:
|
| 51 |
+
name: RecMetric
|
| 52 |
+
main_indicator: acc
|
| 53 |
+
is_filter: True
|
| 54 |
+
|
| 55 |
+
PostProcess:
|
| 56 |
+
name: ARLabelDecode
|
| 57 |
+
|
| 58 |
+
Train:
|
| 59 |
+
dataset:
|
| 60 |
+
name: LMDBDataSet
|
| 61 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 62 |
+
transforms:
|
| 63 |
+
- DecodeImagePIL: # load image
|
| 64 |
+
img_mode: RGB
|
| 65 |
+
- PARSeqAugPIL:
|
| 66 |
+
- ARLabelEncode: # Class handling label
|
| 67 |
+
- RecTVResize:
|
| 68 |
+
image_shape: [64, 256]
|
| 69 |
+
padding: False
|
| 70 |
+
- KeepKeys:
|
| 71 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 72 |
+
loader:
|
| 73 |
+
shuffle: True
|
| 74 |
+
batch_size_per_card: 256
|
| 75 |
+
drop_last: True
|
| 76 |
+
num_workers: 4
|
| 77 |
+
|
| 78 |
+
Eval:
|
| 79 |
+
dataset:
|
| 80 |
+
name: LMDBDataSet
|
| 81 |
+
data_dir: ../evaluation
|
| 82 |
+
transforms:
|
| 83 |
+
- DecodeImagePIL: # load image
|
| 84 |
+
img_mode: RGB
|
| 85 |
+
- ARLabelEncode: # Class handling label
|
| 86 |
+
- RecTVResize:
|
| 87 |
+
image_shape: [64, 256]
|
| 88 |
+
padding: False
|
| 89 |
+
- KeepKeys:
|
| 90 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 91 |
+
loader:
|
| 92 |
+
shuffle: False
|
| 93 |
+
drop_last: False
|
| 94 |
+
batch_size_per_card: 256
|
| 95 |
+
num_workers: 2
|
configs/rec/busnet/svtrv2_busnet.yml
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 10
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtrv2_busnet/
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
# ./output/rec/u14m_filter/svtrv2_busnet_pretraining/best.pth
|
| 12 |
+
checkpoints:
|
| 13 |
+
use_tensorboard: false
|
| 14 |
+
infer_img:
|
| 15 |
+
# for data or label process
|
| 16 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 17 |
+
max_text_length: 25
|
| 18 |
+
use_space_char: False
|
| 19 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_busnet.txt
|
| 20 |
+
use_amp: True
|
| 21 |
+
|
| 22 |
+
Optimizer:
|
| 23 |
+
name: AdamW
|
| 24 |
+
lr: 0.00065 # 4gpus bs256/gpu
|
| 25 |
+
weight_decay: 0.05
|
| 26 |
+
filter_bias_and_bn: True
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: OneCycleLR
|
| 30 |
+
warmup_epoch: 1 # pct_start 0.1*10 = 1ep
|
| 31 |
+
cycle_momentum: False
|
| 32 |
+
|
| 33 |
+
Architecture:
|
| 34 |
+
model_type: rec
|
| 35 |
+
algorithm: BUSBet
|
| 36 |
+
Transform:
|
| 37 |
+
Encoder:
|
| 38 |
+
name: SVTRv2LNConvTwo33
|
| 39 |
+
use_pos_embed: False
|
| 40 |
+
dims: [128, 256, 384]
|
| 41 |
+
depths: [6, 6, 6]
|
| 42 |
+
num_heads: [4, 8, 12]
|
| 43 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 44 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 45 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 46 |
+
last_stage: false
|
| 47 |
+
feat2d: False
|
| 48 |
+
Decoder:
|
| 49 |
+
name: BUSDecoder
|
| 50 |
+
nhead: 6
|
| 51 |
+
num_layers: 6
|
| 52 |
+
dim_feedforward: 1536
|
| 53 |
+
ignore_index: &ignore_index 100
|
| 54 |
+
pretraining: False
|
| 55 |
+
# return_id: 2
|
| 56 |
+
Loss:
|
| 57 |
+
name: ABINetLoss
|
| 58 |
+
ignore_index: *ignore_index
|
| 59 |
+
|
| 60 |
+
PostProcess:
|
| 61 |
+
name: ABINetLabelDecode
|
| 62 |
+
|
| 63 |
+
Metric:
|
| 64 |
+
name: RecMetric
|
| 65 |
+
main_indicator: acc
|
| 66 |
+
is_filter: True
|
| 67 |
+
|
| 68 |
+
Train:
|
| 69 |
+
dataset:
|
| 70 |
+
name: RatioDataSetTVResize
|
| 71 |
+
ds_width: True
|
| 72 |
+
padding: false
|
| 73 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
|
| 74 |
+
'../Union14M-L-LMDB-Filtered/filter_train_hard',
|
| 75 |
+
'../Union14M-L-LMDB-Filtered/filter_train_medium',
|
| 76 |
+
'../Union14M-L-LMDB-Filtered/filter_train_normal',
|
| 77 |
+
'../Union14M-L-LMDB-Filtered/filter_train_easy',
|
| 78 |
+
]
|
| 79 |
+
transforms:
|
| 80 |
+
- DecodeImagePIL: # load image
|
| 81 |
+
img_mode: RGB
|
| 82 |
+
- PARSeqAugPIL:
|
| 83 |
+
- ABINetLabelEncode:
|
| 84 |
+
ignore_index: *ignore_index
|
| 85 |
+
- KeepKeys:
|
| 86 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 87 |
+
sampler:
|
| 88 |
+
name: RatioSampler
|
| 89 |
+
scales: [[128, 32]] # w, h
|
| 90 |
+
# divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
|
| 91 |
+
first_bs: &bs 256
|
| 92 |
+
fix_bs: false
|
| 93 |
+
divided_factor: [4, 16] # w, h
|
| 94 |
+
is_training: True
|
| 95 |
+
loader:
|
| 96 |
+
shuffle: True
|
| 97 |
+
batch_size_per_card: *bs
|
| 98 |
+
drop_last: True
|
| 99 |
+
max_ratio: &max_ratio 4
|
| 100 |
+
num_workers: 4
|
| 101 |
+
|
| 102 |
+
Eval:
|
| 103 |
+
dataset:
|
| 104 |
+
name: RatioDataSetTVResize
|
| 105 |
+
ds_width: True
|
| 106 |
+
padding: False
|
| 107 |
+
data_dir_list: [
|
| 108 |
+
'../evaluation/CUTE80',
|
| 109 |
+
'../evaluation/IC13_857',
|
| 110 |
+
'../evaluation/IC15_1811',
|
| 111 |
+
'../evaluation/IIIT5k',
|
| 112 |
+
'../evaluation/SVT',
|
| 113 |
+
'../evaluation/SVTP',
|
| 114 |
+
]
|
| 115 |
+
transforms:
|
| 116 |
+
- DecodeImagePIL: # load image
|
| 117 |
+
img_mode: RGB
|
| 118 |
+
- ABINetLabelEncode:
|
| 119 |
+
ignore_index: *ignore_index
|
| 120 |
+
- KeepKeys:
|
| 121 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 122 |
+
sampler:
|
| 123 |
+
name: RatioSampler
|
| 124 |
+
scales: [[128, 32]] # w, h
|
| 125 |
+
# divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
|
| 126 |
+
first_bs: *bs
|
| 127 |
+
fix_bs: false
|
| 128 |
+
divided_factor: [4, 16] # w, h
|
| 129 |
+
is_training: False
|
| 130 |
+
loader:
|
| 131 |
+
shuffle: False
|
| 132 |
+
drop_last: False
|
| 133 |
+
batch_size_per_card: *bs
|
| 134 |
+
max_ratio: *max_ratio
|
| 135 |
+
num_workers: 4
|
configs/rec/busnet/svtrv2_busnet_pretraining.yml
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 10
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtrv2_busnet_pretraining/
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: 25
|
| 17 |
+
use_space_char: False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_busnet_pretraining.txt
|
| 19 |
+
use_amp: True
|
| 20 |
+
|
| 21 |
+
Optimizer:
|
| 22 |
+
name: AdamW
|
| 23 |
+
lr: 0.00065 # 4gpus bs256/gpu
|
| 24 |
+
weight_decay: 0.05
|
| 25 |
+
filter_bias_and_bn: True
|
| 26 |
+
|
| 27 |
+
LRScheduler:
|
| 28 |
+
name: OneCycleLR
|
| 29 |
+
warmup_epoch: 1 # pct_start 0.1*10 = 1ep
|
| 30 |
+
cycle_momentum: False
|
| 31 |
+
|
| 32 |
+
Architecture:
|
| 33 |
+
model_type: rec
|
| 34 |
+
algorithm: BUSBet
|
| 35 |
+
Transform:
|
| 36 |
+
Encoder:
|
| 37 |
+
name: SVTRv2LNConvTwo33
|
| 38 |
+
use_pos_embed: False
|
| 39 |
+
dims: [128, 256, 384]
|
| 40 |
+
depths: [6, 6, 6]
|
| 41 |
+
num_heads: [4, 8, 12]
|
| 42 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 43 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 44 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 45 |
+
last_stage: false
|
| 46 |
+
feat2d: False
|
| 47 |
+
Decoder:
|
| 48 |
+
name: BUSDecoder
|
| 49 |
+
nhead: 6
|
| 50 |
+
num_layers: 6
|
| 51 |
+
dim_feedforward: 1536
|
| 52 |
+
ignore_index: &ignore_index 100
|
| 53 |
+
pretraining: True
|
| 54 |
+
# return_id: 0
|
| 55 |
+
Loss:
|
| 56 |
+
name: ABINetLoss
|
| 57 |
+
ignore_index: *ignore_index
|
| 58 |
+
|
| 59 |
+
PostProcess:
|
| 60 |
+
name: ABINetLabelDecode
|
| 61 |
+
|
| 62 |
+
Metric:
|
| 63 |
+
name: RecMetric
|
| 64 |
+
main_indicator: acc
|
| 65 |
+
is_filter: True
|
| 66 |
+
|
| 67 |
+
Train:
|
| 68 |
+
dataset:
|
| 69 |
+
name: RatioDataSetTVResize
|
| 70 |
+
ds_width: True
|
| 71 |
+
padding: false
|
| 72 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
|
| 73 |
+
'../Union14M-L-LMDB-Filtered/filter_train_hard',
|
| 74 |
+
'../Union14M-L-LMDB-Filtered/filter_train_medium',
|
| 75 |
+
'../Union14M-L-LMDB-Filtered/filter_train_normal',
|
| 76 |
+
'../Union14M-L-LMDB-Filtered/filter_train_easy',
|
| 77 |
+
]
|
| 78 |
+
transforms:
|
| 79 |
+
- DecodeImagePIL: # load image
|
| 80 |
+
img_mode: RGB
|
| 81 |
+
- PARSeqAugPIL:
|
| 82 |
+
- ABINetLabelEncode:
|
| 83 |
+
ignore_index: *ignore_index
|
| 84 |
+
- KeepKeys:
|
| 85 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 86 |
+
sampler:
|
| 87 |
+
name: RatioSampler
|
| 88 |
+
scales: [[128, 32]] # w, h
|
| 89 |
+
# divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
|
| 90 |
+
first_bs: &bs 256
|
| 91 |
+
fix_bs: false
|
| 92 |
+
divided_factor: [4, 16] # w, h
|
| 93 |
+
is_training: True
|
| 94 |
+
loader:
|
| 95 |
+
shuffle: True
|
| 96 |
+
batch_size_per_card: *bs
|
| 97 |
+
drop_last: True
|
| 98 |
+
max_ratio: &max_ratio 4
|
| 99 |
+
num_workers: 4
|
| 100 |
+
|
| 101 |
+
Eval:
|
| 102 |
+
dataset:
|
| 103 |
+
name: RatioDataSetTVResize
|
| 104 |
+
ds_width: True
|
| 105 |
+
padding: False
|
| 106 |
+
data_dir_list: [
|
| 107 |
+
'../evaluation/CUTE80',
|
| 108 |
+
'../evaluation/IC13_857',
|
| 109 |
+
'../evaluation/IC15_1811',
|
| 110 |
+
'../evaluation/IIIT5k',
|
| 111 |
+
'../evaluation/SVT',
|
| 112 |
+
'../evaluation/SVTP',
|
| 113 |
+
]
|
| 114 |
+
transforms:
|
| 115 |
+
- DecodeImagePIL: # load image
|
| 116 |
+
img_mode: RGB
|
| 117 |
+
- ABINetLabelEncode:
|
| 118 |
+
ignore_index: *ignore_index
|
| 119 |
+
- KeepKeys:
|
| 120 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 121 |
+
sampler:
|
| 122 |
+
name: RatioSampler
|
| 123 |
+
scales: [[128, 32]] # w, h
|
| 124 |
+
# divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
|
| 125 |
+
first_bs: *bs
|
| 126 |
+
fix_bs: false
|
| 127 |
+
divided_factor: [4, 16] # w, h
|
| 128 |
+
is_training: False
|
| 129 |
+
loader:
|
| 130 |
+
shuffle: False
|
| 131 |
+
drop_last: False
|
| 132 |
+
batch_size_per_card: *bs
|
| 133 |
+
max_ratio: *max_ratio
|
| 134 |
+
num_workers: 4
|
configs/rec/busnet/vit_busnet.yml
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 10
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/vit_busnet/
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: 25
|
| 17 |
+
use_space_char: False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_vit_busnet.txt
|
| 19 |
+
grad_clip_val: 20
|
| 20 |
+
use_amp: True
|
| 21 |
+
|
| 22 |
+
Optimizer:
|
| 23 |
+
name: Adam
|
| 24 |
+
lr: 0.00053 # 4gpus bs256/gpu
|
| 25 |
+
weight_decay: 0.0
|
| 26 |
+
filter_bias_and_bn: False
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: MultiStepLR
|
| 30 |
+
milestones: [6]
|
| 31 |
+
gamma: 0.1
|
| 32 |
+
|
| 33 |
+
Architecture:
|
| 34 |
+
model_type: rec
|
| 35 |
+
algorithm: BUSBet
|
| 36 |
+
Transform:
|
| 37 |
+
Encoder:
|
| 38 |
+
name: ViT
|
| 39 |
+
img_size: [32,128]
|
| 40 |
+
patch_size: [4, 8]
|
| 41 |
+
embed_dim: 384
|
| 42 |
+
depth: 12
|
| 43 |
+
num_heads: 6
|
| 44 |
+
mlp_ratio: 4
|
| 45 |
+
qkv_bias: True
|
| 46 |
+
Decoder:
|
| 47 |
+
name: BUSDecoder
|
| 48 |
+
nhead: 6
|
| 49 |
+
num_layers: 6
|
| 50 |
+
dim_feedforward: 1536
|
| 51 |
+
ignore_index: &ignore_index 100
|
| 52 |
+
pretraining: False
|
| 53 |
+
Loss:
|
| 54 |
+
name: ABINetLoss
|
| 55 |
+
ignore_index: *ignore_index
|
| 56 |
+
|
| 57 |
+
PostProcess:
|
| 58 |
+
name: ABINetLabelDecode
|
| 59 |
+
|
| 60 |
+
Metric:
|
| 61 |
+
name: RecMetric
|
| 62 |
+
main_indicator: acc
|
| 63 |
+
is_filter: True
|
| 64 |
+
|
| 65 |
+
Train:
|
| 66 |
+
dataset:
|
| 67 |
+
name: LMDBDataSet
|
| 68 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 69 |
+
transforms:
|
| 70 |
+
- DecodeImagePIL: # load image
|
| 71 |
+
img_mode: RGB
|
| 72 |
+
- PARSeqAugPIL:
|
| 73 |
+
- ABINetLabelEncode:
|
| 74 |
+
ignore_index: *ignore_index
|
| 75 |
+
- RecTVResize:
|
| 76 |
+
image_shape: [32, 128]
|
| 77 |
+
padding: False
|
| 78 |
+
- KeepKeys:
|
| 79 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 80 |
+
loader:
|
| 81 |
+
shuffle: True
|
| 82 |
+
batch_size_per_card: 256
|
| 83 |
+
drop_last: True
|
| 84 |
+
num_workers: 4
|
| 85 |
+
|
| 86 |
+
Eval:
|
| 87 |
+
dataset:
|
| 88 |
+
name: LMDBDataSet
|
| 89 |
+
data_dir: ../evaluation
|
| 90 |
+
transforms:
|
| 91 |
+
- DecodeImagePIL: # load image
|
| 92 |
+
img_mode: RGB
|
| 93 |
+
- ABINetLabelEncode:
|
| 94 |
+
ignore_index: *ignore_index
|
| 95 |
+
- RecTVResize:
|
| 96 |
+
image_shape: [32, 128]
|
| 97 |
+
padding: False
|
| 98 |
+
- KeepKeys:
|
| 99 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 100 |
+
loader:
|
| 101 |
+
shuffle: False
|
| 102 |
+
drop_last: False
|
| 103 |
+
batch_size_per_card: 256
|
| 104 |
+
num_workers: 2
|
configs/rec/busnet/vit_busnet_pretraining.yml
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 10
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/vit_busnet_pretraining/
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: 25
|
| 17 |
+
use_space_char: False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_vit_busnet_pretraining.txt
|
| 19 |
+
grad_clip_val: 20
|
| 20 |
+
use_amp: True
|
| 21 |
+
|
| 22 |
+
Optimizer:
|
| 23 |
+
name: Adam
|
| 24 |
+
lr: 0.00053 # 4gpus bs256/gpu
|
| 25 |
+
weight_decay: 0.0
|
| 26 |
+
filter_bias_and_bn: False
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: MultiStepLR
|
| 30 |
+
milestones: [6]
|
| 31 |
+
gamma: 0.1
|
| 32 |
+
|
| 33 |
+
Architecture:
|
| 34 |
+
model_type: rec
|
| 35 |
+
algorithm: BUSBet
|
| 36 |
+
Transform:
|
| 37 |
+
Encoder:
|
| 38 |
+
name: ViT
|
| 39 |
+
img_size: [32,128]
|
| 40 |
+
patch_size: [4, 8]
|
| 41 |
+
embed_dim: 384
|
| 42 |
+
depth: 12
|
| 43 |
+
num_heads: 6
|
| 44 |
+
mlp_ratio: 4
|
| 45 |
+
qkv_bias: True
|
| 46 |
+
Decoder:
|
| 47 |
+
name: BUSDecoder
|
| 48 |
+
nhead: 6
|
| 49 |
+
num_layers: 6
|
| 50 |
+
dim_feedforward: 1536
|
| 51 |
+
ignore_index: &ignore_index 100
|
| 52 |
+
pretraining: True
|
| 53 |
+
Loss:
|
| 54 |
+
name: ABINetLoss
|
| 55 |
+
ignore_index: *ignore_index
|
| 56 |
+
|
| 57 |
+
PostProcess:
|
| 58 |
+
name: ABINetLabelDecode
|
| 59 |
+
|
| 60 |
+
Metric:
|
| 61 |
+
name: RecMetric
|
| 62 |
+
main_indicator: acc
|
| 63 |
+
is_filter: True
|
| 64 |
+
|
| 65 |
+
Train:
|
| 66 |
+
dataset:
|
| 67 |
+
name: LMDBDataSet
|
| 68 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 69 |
+
transforms:
|
| 70 |
+
- DecodeImagePIL: # load image
|
| 71 |
+
img_mode: RGB
|
| 72 |
+
- PARSeqAugPIL:
|
| 73 |
+
- ABINetLabelEncode:
|
| 74 |
+
ignore_index: *ignore_index
|
| 75 |
+
- RecTVResize:
|
| 76 |
+
image_shape: [32, 128]
|
| 77 |
+
padding: False
|
| 78 |
+
- KeepKeys:
|
| 79 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 80 |
+
loader:
|
| 81 |
+
shuffle: True
|
| 82 |
+
batch_size_per_card: 256
|
| 83 |
+
drop_last: True
|
| 84 |
+
num_workers: 4
|
| 85 |
+
|
| 86 |
+
Eval:
|
| 87 |
+
dataset:
|
| 88 |
+
name: LMDBDataSet
|
| 89 |
+
data_dir: ../evaluation
|
| 90 |
+
transforms:
|
| 91 |
+
- DecodeImagePIL: # load image
|
| 92 |
+
img_mode: RGB
|
| 93 |
+
- ABINetLabelEncode:
|
| 94 |
+
ignore_index: *ignore_index
|
| 95 |
+
- RecTVResize:
|
| 96 |
+
image_shape: [32, 128]
|
| 97 |
+
padding: False
|
| 98 |
+
- KeepKeys:
|
| 99 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 100 |
+
loader:
|
| 101 |
+
shuffle: False
|
| 102 |
+
drop_last: False
|
| 103 |
+
batch_size_per_card: 256
|
| 104 |
+
num_workers: 2
|
configs/rec/cam/convnextv2_cam_tps_on.yml
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/convnextv2_cam_tps_on
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: False
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: &max_text_length 25
|
| 17 |
+
use_space_char: False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_convnextv2_cam_tps_on.txt
|
| 19 |
+
use_amp: True
|
| 20 |
+
|
| 21 |
+
Optimizer:
|
| 22 |
+
name: AdamW
|
| 23 |
+
lr: 0.0008 # for 4gpus bs256/gpu
|
| 24 |
+
weight_decay: 0.05
|
| 25 |
+
filter_bias_and_bn: True
|
| 26 |
+
eps: 1.e-8
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: OneCycleLR
|
| 30 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
|
| 31 |
+
cycle_momentum: False
|
| 32 |
+
|
| 33 |
+
Architecture:
|
| 34 |
+
model_type: rec
|
| 35 |
+
algorithm: CAM
|
| 36 |
+
Transform:
|
| 37 |
+
name: Aster_TPS
|
| 38 |
+
tps_inputsize: [32, 64]
|
| 39 |
+
tps_outputsize: &img_shape [32, 128]
|
| 40 |
+
Encoder:
|
| 41 |
+
name: CAMEncoder
|
| 42 |
+
encoder_config:
|
| 43 |
+
name: ConvNeXtV2
|
| 44 |
+
depths: [2, 2, 8, 2]
|
| 45 |
+
dims: [80, 160, 320, 640]
|
| 46 |
+
strides: [[4,4], [2,1], [2,1], [1,1]]
|
| 47 |
+
drop_path_rate: 0.2
|
| 48 |
+
feat2d: True
|
| 49 |
+
nb_classes: 97
|
| 50 |
+
strides: [[4,4], [2,1], [2,1], [1,1]]
|
| 51 |
+
deform_stride: 2
|
| 52 |
+
stage_idx: 2
|
| 53 |
+
use_depthwise_unet: True
|
| 54 |
+
use_more_unet: False
|
| 55 |
+
binary_loss_type: BanlanceMultiClassCrossEntropyLoss
|
| 56 |
+
mid_size: True
|
| 57 |
+
d_embedding: 384
|
| 58 |
+
Decoder:
|
| 59 |
+
name: CAMDecoder
|
| 60 |
+
num_encoder_layers: -1
|
| 61 |
+
beam_size: 0
|
| 62 |
+
num_decoder_layers: 2
|
| 63 |
+
nhead: 8
|
| 64 |
+
max_len: *max_text_length
|
| 65 |
+
|
| 66 |
+
Loss:
|
| 67 |
+
name: CAMLoss
|
| 68 |
+
loss_weight_binary: 1.5
|
| 69 |
+
label_smoothing: 0.
|
| 70 |
+
|
| 71 |
+
Metric:
|
| 72 |
+
name: RecMetric
|
| 73 |
+
main_indicator: acc
|
| 74 |
+
is_filter: True
|
| 75 |
+
|
| 76 |
+
PostProcess:
|
| 77 |
+
name: ARLabelDecode
|
| 78 |
+
|
| 79 |
+
Train:
|
| 80 |
+
dataset:
|
| 81 |
+
name: LMDBDataSet
|
| 82 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 83 |
+
transforms:
|
| 84 |
+
- DecodeImagePIL: # load image
|
| 85 |
+
img_mode: RGB
|
| 86 |
+
- PARSeqAugPIL:
|
| 87 |
+
- CAMLabelEncode: # Class handling label
|
| 88 |
+
font_path: ./arial.ttf
|
| 89 |
+
image_shape: *img_shape
|
| 90 |
+
- RecTVResize:
|
| 91 |
+
image_shape: [64, 256]
|
| 92 |
+
padding: False
|
| 93 |
+
- KeepKeys:
|
| 94 |
+
keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
|
| 95 |
+
loader:
|
| 96 |
+
shuffle: True
|
| 97 |
+
batch_size_per_card: 256
|
| 98 |
+
drop_last: True
|
| 99 |
+
num_workers: 4
|
| 100 |
+
|
| 101 |
+
Eval:
|
| 102 |
+
dataset:
|
| 103 |
+
name: LMDBDataSet
|
| 104 |
+
data_dir: ../evaluation
|
| 105 |
+
transforms:
|
| 106 |
+
- DecodeImagePIL: # load image
|
| 107 |
+
img_mode: RGB
|
| 108 |
+
- ARLabelEncode: # Class handling label
|
| 109 |
+
- RecTVResize:
|
| 110 |
+
image_shape: [64, 256]
|
| 111 |
+
padding: False
|
| 112 |
+
- KeepKeys:
|
| 113 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 114 |
+
loader:
|
| 115 |
+
shuffle: False
|
| 116 |
+
drop_last: False
|
| 117 |
+
batch_size_per_card: 256
|
| 118 |
+
num_workers: 2
|
configs/rec/cam/convnextv2_tiny_cam_tps_on.yml
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/convnextv2_tiny_cam_tps_on
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: False
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: &max_text_length 25
|
| 17 |
+
use_space_char: False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_convnextv2_cam_tps_on.txt
|
| 19 |
+
use_amp: True
|
| 20 |
+
|
| 21 |
+
Optimizer:
|
| 22 |
+
name: AdamW
|
| 23 |
+
lr: 0.0008 # for 4gpus bs256/gpu
|
| 24 |
+
weight_decay: 0.05
|
| 25 |
+
filter_bias_and_bn: True
|
| 26 |
+
eps: 1.e-8
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: OneCycleLR
|
| 30 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
|
| 31 |
+
cycle_momentum: False
|
| 32 |
+
|
| 33 |
+
Architecture:
|
| 34 |
+
model_type: rec
|
| 35 |
+
algorithm: CAM
|
| 36 |
+
Transform:
|
| 37 |
+
name: Aster_TPS
|
| 38 |
+
tps_inputsize: [32, 64]
|
| 39 |
+
tps_outputsize: &img_shape [32, 128]
|
| 40 |
+
Encoder:
|
| 41 |
+
name: CAMEncoder
|
| 42 |
+
encoder_config:
|
| 43 |
+
name: ConvNeXtV2
|
| 44 |
+
depths: [3, 3, 9, 3]
|
| 45 |
+
dims: [96, 192, 384, 768]
|
| 46 |
+
strides: [[4,4], [2,1], [2,1], [1,1]]
|
| 47 |
+
drop_path_rate: 0.2
|
| 48 |
+
feat2d: True
|
| 49 |
+
nb_classes: 97
|
| 50 |
+
strides: [[4,4], [2,1], [2,1], [1,1]]
|
| 51 |
+
deform_stride: 2
|
| 52 |
+
stage_idx: 2
|
| 53 |
+
use_depthwise_unet: True
|
| 54 |
+
use_more_unet: False
|
| 55 |
+
binary_loss_type: BanlanceMultiClassCrossEntropyLoss
|
| 56 |
+
mid_size: False
|
| 57 |
+
d_embedding: 512
|
| 58 |
+
Decoder:
|
| 59 |
+
name: CAMDecoder
|
| 60 |
+
num_encoder_layers: -1
|
| 61 |
+
beam_size: 0
|
| 62 |
+
num_decoder_layers: 2
|
| 63 |
+
nhead: 8
|
| 64 |
+
max_len: *max_text_length
|
| 65 |
+
|
| 66 |
+
Loss:
|
| 67 |
+
name: CAMLoss
|
| 68 |
+
loss_weight_binary: 1.5
|
| 69 |
+
label_smoothing: 0.
|
| 70 |
+
|
| 71 |
+
Metric:
|
| 72 |
+
name: RecMetric
|
| 73 |
+
main_indicator: acc
|
| 74 |
+
is_filter: True
|
| 75 |
+
|
| 76 |
+
PostProcess:
|
| 77 |
+
name: ARLabelDecode
|
| 78 |
+
|
| 79 |
+
Train:
|
| 80 |
+
dataset:
|
| 81 |
+
name: LMDBDataSet
|
| 82 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 83 |
+
transforms:
|
| 84 |
+
- DecodeImagePIL: # load image
|
| 85 |
+
img_mode: RGB
|
| 86 |
+
- PARSeqAugPIL:
|
| 87 |
+
- CAMLabelEncode: # Class handling label
|
| 88 |
+
font_path: ./arial.ttf
|
| 89 |
+
image_shape: *img_shape
|
| 90 |
+
- RecTVResize:
|
| 91 |
+
image_shape: [64, 256]
|
| 92 |
+
padding: False
|
| 93 |
+
- KeepKeys:
|
| 94 |
+
keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
|
| 95 |
+
loader:
|
| 96 |
+
shuffle: True
|
| 97 |
+
batch_size_per_card: 256
|
| 98 |
+
drop_last: True
|
| 99 |
+
num_workers: 4
|
| 100 |
+
|
| 101 |
+
Eval:
|
| 102 |
+
dataset:
|
| 103 |
+
name: LMDBDataSet
|
| 104 |
+
data_dir: ../evaluation
|
| 105 |
+
transforms:
|
| 106 |
+
- DecodeImagePIL: # load image
|
| 107 |
+
img_mode: RGB
|
| 108 |
+
- ARLabelEncode: # Class handling label
|
| 109 |
+
- RecTVResize:
|
| 110 |
+
image_shape: [64, 256]
|
| 111 |
+
padding: False
|
| 112 |
+
- KeepKeys:
|
| 113 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 114 |
+
loader:
|
| 115 |
+
shuffle: False
|
| 116 |
+
drop_last: False
|
| 117 |
+
batch_size_per_card: 256
|
| 118 |
+
num_workers: 2
|
configs/rec/cam/svtrv2_cam_tps_on.yml
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtrv2_cam_tps_on
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: False
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: &max_text_length 25
|
| 17 |
+
use_space_char: False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cam_tps_on.txt
|
| 19 |
+
use_amp: True
|
| 20 |
+
|
| 21 |
+
Optimizer:
|
| 22 |
+
name: AdamW
|
| 23 |
+
lr: 0.00065 # for 4gpus bs256/gpu
|
| 24 |
+
weight_decay: 0.05
|
| 25 |
+
filter_bias_and_bn: True
|
| 26 |
+
|
| 27 |
+
LRScheduler:
|
| 28 |
+
name: OneCycleLR
|
| 29 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 30 |
+
cycle_momentum: False
|
| 31 |
+
|
| 32 |
+
Architecture:
|
| 33 |
+
model_type: rec
|
| 34 |
+
algorithm: CAM
|
| 35 |
+
Transform:
|
| 36 |
+
name: Aster_TPS
|
| 37 |
+
tps_inputsize: [32, 64]
|
| 38 |
+
tps_outputsize: &img_shape [32, 128]
|
| 39 |
+
Encoder:
|
| 40 |
+
name: CAMEncoder
|
| 41 |
+
encoder_config:
|
| 42 |
+
name: SVTRv2LNConvTwo33
|
| 43 |
+
use_pos_embed: False
|
| 44 |
+
dims: [128, 256, 384]
|
| 45 |
+
depths: [6, 6, 6]
|
| 46 |
+
num_heads: [4, 8, 12]
|
| 47 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 48 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 49 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 50 |
+
last_stage: false
|
| 51 |
+
feat2d: True
|
| 52 |
+
nb_classes: 97
|
| 53 |
+
strides: [[4, 4], [1, 1], [2, 1], [1, 1]]
|
| 54 |
+
k_size: [[2, 2], [1, 1], [2, 1], [1, 1]]
|
| 55 |
+
q_size: [4, 32]
|
| 56 |
+
deform_stride: 2
|
| 57 |
+
stage_idx: 2
|
| 58 |
+
use_depthwise_unet: True
|
| 59 |
+
use_more_unet: False
|
| 60 |
+
binary_loss_type: BanlanceMultiClassCrossEntropyLoss
|
| 61 |
+
mid_size: True
|
| 62 |
+
d_embedding: 384
|
| 63 |
+
Decoder:
|
| 64 |
+
name: CAMDecoder
|
| 65 |
+
num_encoder_layers: -1
|
| 66 |
+
beam_size: 0
|
| 67 |
+
num_decoder_layers: 2
|
| 68 |
+
nhead: 8
|
| 69 |
+
max_len: *max_text_length
|
| 70 |
+
|
| 71 |
+
Loss:
|
| 72 |
+
name: CAMLoss
|
| 73 |
+
loss_weight_binary: 1.5
|
| 74 |
+
label_smoothing: 0.
|
| 75 |
+
|
| 76 |
+
Metric:
|
| 77 |
+
name: RecMetric
|
| 78 |
+
main_indicator: acc
|
| 79 |
+
is_filter: True
|
| 80 |
+
|
| 81 |
+
PostProcess:
|
| 82 |
+
name: ARLabelDecode
|
| 83 |
+
|
| 84 |
+
Train:
|
| 85 |
+
dataset:
|
| 86 |
+
name: LMDBDataSet
|
| 87 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 88 |
+
transforms:
|
| 89 |
+
- DecodeImagePIL: # load image
|
| 90 |
+
img_mode: RGB
|
| 91 |
+
- PARSeqAugPIL:
|
| 92 |
+
- CAMLabelEncode: # Class handling label
|
| 93 |
+
font_path: ./arial.ttf
|
| 94 |
+
image_shape: *img_shape
|
| 95 |
+
- RecTVResize:
|
| 96 |
+
image_shape: [64, 256]
|
| 97 |
+
padding: False
|
| 98 |
+
- KeepKeys:
|
| 99 |
+
keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
|
| 100 |
+
loader:
|
| 101 |
+
shuffle: True
|
| 102 |
+
batch_size_per_card: 256
|
| 103 |
+
drop_last: True
|
| 104 |
+
num_workers: 4
|
| 105 |
+
|
| 106 |
+
Eval:
|
| 107 |
+
dataset:
|
| 108 |
+
name: LMDBDataSet
|
| 109 |
+
data_dir: ../evaluation
|
| 110 |
+
transforms:
|
| 111 |
+
- DecodeImagePIL: # load image
|
| 112 |
+
img_mode: RGB
|
| 113 |
+
- ARLabelEncode: # Class handling label
|
| 114 |
+
- RecTVResize:
|
| 115 |
+
image_shape: [64, 256]
|
| 116 |
+
padding: False
|
| 117 |
+
- KeepKeys:
|
| 118 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 119 |
+
loader:
|
| 120 |
+
shuffle: False
|
| 121 |
+
drop_last: False
|
| 122 |
+
batch_size_per_card: 256
|
| 123 |
+
num_workers: 2
|
configs/rec/cdistnet/resnet45_trans_cdistnet.yml
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/resnet45_trans_cdistnet
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: 25
|
| 17 |
+
use_space_char: False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_cdistnet.txt
|
| 19 |
+
use_amp: True
|
| 20 |
+
grad_clip_val: 5
|
| 21 |
+
|
| 22 |
+
Optimizer:
|
| 23 |
+
name: Adam
|
| 24 |
+
lr: 0.002 # for 4gpus bs256/gpu
|
| 25 |
+
weight_decay: 0.0
|
| 26 |
+
filter_bias_and_bn: False
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: OneCycleLR
|
| 30 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 31 |
+
cycle_momentum: False
|
| 32 |
+
|
| 33 |
+
Architecture:
|
| 34 |
+
model_type: rec
|
| 35 |
+
algorithm: CDistNet
|
| 36 |
+
Transform:
|
| 37 |
+
Encoder:
|
| 38 |
+
name: ResNet45
|
| 39 |
+
in_channels: 3
|
| 40 |
+
strides: [2, 1, 2, 1, 1]
|
| 41 |
+
Decoder:
|
| 42 |
+
name: CDistNetDecoder
|
| 43 |
+
add_conv: True
|
| 44 |
+
|
| 45 |
+
Loss:
|
| 46 |
+
name: ARLoss
|
| 47 |
+
|
| 48 |
+
PostProcess:
|
| 49 |
+
name: ARLabelDecode
|
| 50 |
+
|
| 51 |
+
Metric:
|
| 52 |
+
name: RecMetric
|
| 53 |
+
main_indicator: acc
|
| 54 |
+
is_filter: True
|
| 55 |
+
|
| 56 |
+
Train:
|
| 57 |
+
dataset:
|
| 58 |
+
name: LMDBDataSet
|
| 59 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 60 |
+
transforms:
|
| 61 |
+
- DecodeImagePIL: # load image
|
| 62 |
+
img_mode: RGB
|
| 63 |
+
- PARSeqAugPIL:
|
| 64 |
+
- ARLabelEncode: # Class handling label
|
| 65 |
+
- RecTVResize:
|
| 66 |
+
image_shape: [32, 128]
|
| 67 |
+
padding: False
|
| 68 |
+
- KeepKeys:
|
| 69 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 70 |
+
loader:
|
| 71 |
+
shuffle: True
|
| 72 |
+
batch_size_per_card: 256
|
| 73 |
+
drop_last: True
|
| 74 |
+
num_workers: 4
|
| 75 |
+
|
| 76 |
+
Eval:
|
| 77 |
+
dataset:
|
| 78 |
+
name: LMDBDataSet
|
| 79 |
+
data_dir: ../evaluation
|
| 80 |
+
transforms:
|
| 81 |
+
- DecodeImagePIL: # load image
|
| 82 |
+
img_mode: RGB
|
| 83 |
+
- ARLabelEncode: # Class handling label
|
| 84 |
+
- RecTVResize:
|
| 85 |
+
image_shape: [32, 128]
|
| 86 |
+
padding: False
|
| 87 |
+
- KeepKeys:
|
| 88 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 89 |
+
loader:
|
| 90 |
+
shuffle: False
|
| 91 |
+
drop_last: False
|
| 92 |
+
batch_size_per_card: 256
|
| 93 |
+
num_workers: 2
|
configs/rec/cdistnet/svtrv2_cdistnet.yml
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtrv2_cdistnet/
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
|
| 16 |
+
# ./tools/utils/ppocr_keys_v1.txt # ch
|
| 17 |
+
max_text_length: &max_text_length 25
|
| 18 |
+
use_space_char: &use_space_char False
|
| 19 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cdistnet.txt
|
| 20 |
+
use_amp: True
|
| 21 |
+
|
| 22 |
+
Optimizer:
|
| 23 |
+
name: AdamW
|
| 24 |
+
lr: 0.00065 #4gpus bs256/gpu
|
| 25 |
+
weight_decay: 0.05
|
| 26 |
+
filter_bias_and_bn: True
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: OneCycleLR
|
| 30 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 31 |
+
cycle_momentum: False
|
| 32 |
+
|
| 33 |
+
Architecture:
|
| 34 |
+
model_type: rec
|
| 35 |
+
algorithm: CDistNet
|
| 36 |
+
in_channels: 3
|
| 37 |
+
Transform:
|
| 38 |
+
Encoder:
|
| 39 |
+
name: SVTRv2LNConvTwo33
|
| 40 |
+
use_pos_embed: False
|
| 41 |
+
out_channels: 256
|
| 42 |
+
dims: [128, 256, 384]
|
| 43 |
+
depths: [6, 6, 6]
|
| 44 |
+
num_heads: [4, 8, 12]
|
| 45 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 46 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 47 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 48 |
+
last_stage: false
|
| 49 |
+
feat2d: True
|
| 50 |
+
Decoder:
|
| 51 |
+
name: CDistNetDecoder
|
| 52 |
+
add_conv: False
|
| 53 |
+
num_encoder_blocks: 0
|
| 54 |
+
|
| 55 |
+
Loss:
|
| 56 |
+
name: ARLoss
|
| 57 |
+
|
| 58 |
+
PostProcess:
|
| 59 |
+
name: ARLabelDecode
|
| 60 |
+
character_dict_path: *character_dict_path
|
| 61 |
+
use_space_char: *use_space_char
|
| 62 |
+
|
| 63 |
+
Metric:
|
| 64 |
+
name: RecMetric
|
| 65 |
+
main_indicator: acc
|
| 66 |
+
is_filter: True
|
| 67 |
+
|
| 68 |
+
Train:
|
| 69 |
+
dataset:
|
| 70 |
+
name: RatioDataSetTVResize
|
| 71 |
+
ds_width: True
|
| 72 |
+
padding: false
|
| 73 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
|
| 74 |
+
'../Union14M-L-LMDB-Filtered/filter_train_hard',
|
| 75 |
+
'../Union14M-L-LMDB-Filtered/filter_train_medium',
|
| 76 |
+
'../Union14M-L-LMDB-Filtered/filter_train_normal',
|
| 77 |
+
'../Union14M-L-LMDB-Filtered/filter_train_easy',
|
| 78 |
+
]
|
| 79 |
+
transforms:
|
| 80 |
+
- DecodeImagePIL: # load image
|
| 81 |
+
img_mode: RGB
|
| 82 |
+
- PARSeqAugPIL:
|
| 83 |
+
- ARLabelEncode: # Class handling label
|
| 84 |
+
character_dict_path: *character_dict_path
|
| 85 |
+
use_space_char: *use_space_char
|
| 86 |
+
max_text_length: *max_text_length
|
| 87 |
+
- KeepKeys:
|
| 88 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 89 |
+
sampler:
|
| 90 |
+
name: RatioSampler
|
| 91 |
+
scales: [[128, 32]] # w, h
|
| 92 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 93 |
+
first_bs: &bs 256
|
| 94 |
+
fix_bs: false
|
| 95 |
+
divided_factor: [4, 16] # w, h
|
| 96 |
+
is_training: True
|
| 97 |
+
loader:
|
| 98 |
+
shuffle: True
|
| 99 |
+
batch_size_per_card: *bs
|
| 100 |
+
drop_last: True
|
| 101 |
+
max_ratio: &max_ratio 4
|
| 102 |
+
num_workers: 4
|
| 103 |
+
|
| 104 |
+
Eval:
|
| 105 |
+
dataset:
|
| 106 |
+
name: RatioDataSetTVResize
|
| 107 |
+
ds_width: True
|
| 108 |
+
padding: False
|
| 109 |
+
data_dir_list: [
|
| 110 |
+
'../evaluation/CUTE80',
|
| 111 |
+
'../evaluation/IC13_857',
|
| 112 |
+
'../evaluation/IC15_1811',
|
| 113 |
+
'../evaluation/IIIT5k',
|
| 114 |
+
'../evaluation/SVT',
|
| 115 |
+
'../evaluation/SVTP',
|
| 116 |
+
]
|
| 117 |
+
transforms:
|
| 118 |
+
- DecodeImagePIL: # load image
|
| 119 |
+
img_mode: RGB
|
| 120 |
+
- ARLabelEncode: # Class handling label
|
| 121 |
+
character_dict_path: *character_dict_path
|
| 122 |
+
use_space_char: *use_space_char
|
| 123 |
+
max_text_length: *max_text_length
|
| 124 |
+
- KeepKeys:
|
| 125 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 126 |
+
sampler:
|
| 127 |
+
name: RatioSampler
|
| 128 |
+
scales: [[128, 32]] # w, h
|
| 129 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 130 |
+
first_bs: *bs
|
| 131 |
+
fix_bs: false
|
| 132 |
+
divided_factor: [4, 16] # w, h
|
| 133 |
+
is_training: False
|
| 134 |
+
loader:
|
| 135 |
+
shuffle: False
|
| 136 |
+
drop_last: False
|
| 137 |
+
batch_size_per_card: *bs
|
| 138 |
+
max_ratio: *max_ratio
|
| 139 |
+
num_workers: 4
|
configs/rec/cppd/svtr_base_cppd.yml
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtr_base_cppd/
|
| 7 |
+
save_epoch_step: [15, 1]
|
| 8 |
+
# evaluation is run every 500 iterations (see eval_batch_step)
|
| 9 |
+
eval_batch_step: [0, 500]
|
| 10 |
+
eval_epoch_step: [0, 1]
|
| 11 |
+
cal_metric_during_train: True
|
| 12 |
+
pretrained_model:
|
| 13 |
+
checkpoints:
|
| 14 |
+
use_tensorboard: false
|
| 15 |
+
infer_img:
|
| 16 |
+
# for data or label process
|
| 17 |
+
character_dict_path: &character_dict_path
|
| 18 |
+
# ./tools/utils/EN_symbol_dict.txt # 96en
|
| 19 |
+
# ./tools/utils/ppocr_keys_v1.txt # ch
|
| 20 |
+
max_text_length: &max_text_length 25
|
| 21 |
+
use_space_char: &use_space_char False
|
| 22 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_cppd.txt
|
| 23 |
+
use_amp: True
|
| 24 |
+
|
| 25 |
+
Optimizer:
|
| 26 |
+
name: AdamW
|
| 27 |
+
lr: 0.00065 # for 4gpus bs256/gpu
|
| 28 |
+
weight_decay: 0.05
|
| 29 |
+
filter_bias_and_bn: True
|
| 30 |
+
|
| 31 |
+
LRScheduler:
|
| 32 |
+
name: OneCycleLR
|
| 33 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 34 |
+
cycle_momentum: False
|
| 35 |
+
|
| 36 |
+
Architecture:
|
| 37 |
+
model_type: rec
|
| 38 |
+
algorithm: CPPD
|
| 39 |
+
in_channels: 3
|
| 40 |
+
Transform:
|
| 41 |
+
Encoder:
|
| 42 |
+
name: SVTRNet
|
| 43 |
+
img_size: [32, 128]
|
| 44 |
+
out_char_num: 25
|
| 45 |
+
out_channels: 256
|
| 46 |
+
patch_merging: 'Conv'
|
| 47 |
+
embed_dim: [128, 256, 384]
|
| 48 |
+
depth: [6, 6, 6]
|
| 49 |
+
num_heads: [4, 8, 12]
|
| 50 |
+
mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
|
| 51 |
+
local_mixer: [[5, 5], [5, 5], [5, 5]]
|
| 52 |
+
last_stage: False
|
| 53 |
+
prenorm: True
|
| 54 |
+
Decoder:
|
| 55 |
+
name: CPPDDecoder
|
| 56 |
+
vis_seq: 64
|
| 57 |
+
num_layer: 2
|
| 58 |
+
pos_len: False
|
| 59 |
+
rec_layer: 1
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
Loss:
|
| 63 |
+
name: CPPDLoss
|
| 64 |
+
ignore_index: 100
|
| 65 |
+
smoothing: True
|
| 66 |
+
pos_len: False
|
| 67 |
+
sideloss_weight: 1.0
|
| 68 |
+
|
| 69 |
+
PostProcess:
|
| 70 |
+
name: CPPDLabelDecode
|
| 71 |
+
character_dict_path: *character_dict_path
|
| 72 |
+
use_space_char: *use_space_char
|
| 73 |
+
|
| 74 |
+
Metric:
|
| 75 |
+
name: RecMetric
|
| 76 |
+
main_indicator: acc
|
| 77 |
+
|
| 78 |
+
Train:
|
| 79 |
+
dataset:
|
| 80 |
+
name: LMDBDataSet
|
| 81 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 82 |
+
transforms:
|
| 83 |
+
- DecodeImagePIL: # load image
|
| 84 |
+
img_mode: RGB
|
| 85 |
+
- PARSeqAugPIL:
|
| 86 |
+
- CPPDLabelEncode: # Class handling label
|
| 87 |
+
pos_len: False
|
| 88 |
+
character_dict_path: *character_dict_path
|
| 89 |
+
use_space_char: *use_space_char
|
| 90 |
+
max_text_length: *max_text_length
|
| 91 |
+
- RecTVResize:
|
| 92 |
+
image_shape: [32, 128]
|
| 93 |
+
padding: False
|
| 94 |
+
- KeepKeys:
|
| 95 |
+
keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
|
| 96 |
+
loader:
|
| 97 |
+
shuffle: True
|
| 98 |
+
batch_size_per_card: 256
|
| 99 |
+
drop_last: True
|
| 100 |
+
num_workers: 4
|
| 101 |
+
|
| 102 |
+
Eval:
|
| 103 |
+
dataset:
|
| 104 |
+
name: LMDBDataSet
|
| 105 |
+
data_dir: ../evaluation/
|
| 106 |
+
transforms:
|
| 107 |
+
- DecodeImagePIL: # load image
|
| 108 |
+
img_mode: RGB
|
| 109 |
+
- CPPDLabelEncode: # Class handling label
|
| 110 |
+
pos_len: False
|
| 111 |
+
character_dict_path: *character_dict_path
|
| 112 |
+
use_space_char: *use_space_char
|
| 113 |
+
max_text_length: *max_text_length
|
| 114 |
+
- RecTVResize:
|
| 115 |
+
image_shape: [32, 128]
|
| 116 |
+
padding: False
|
| 117 |
+
- KeepKeys:
|
| 118 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 119 |
+
loader:
|
| 120 |
+
shuffle: False
|
| 121 |
+
drop_last: False
|
| 122 |
+
batch_size_per_card: 128
|
| 123 |
+
num_workers: 4
|
configs/rec/cppd/svtr_base_cppd_ch.yml
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 100
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/ch/svtr_base_cppd/
|
| 7 |
+
save_epoch_step: [15, 1]
|
| 8 |
+
# evaluation is run every 2000 iterations
|
| 9 |
+
eval_batch_step: [0, 2000]
|
| 10 |
+
eval_epoch_step: [0, 1]
|
| 11 |
+
cal_metric_during_train: False
|
| 12 |
+
pretrained_model:
|
| 13 |
+
checkpoints:
|
| 14 |
+
use_tensorboard: false
|
| 15 |
+
infer_img:
|
| 16 |
+
# for data or label process
|
| 17 |
+
character_dict_path: &character_dict_path ./tools/utils/ppocr_keys_v1.txt
|
| 18 |
+
# ./tools/utils/EN_symbol_dict.txt # 96en
|
| 19 |
+
# ./tools/utils/ppocr_keys_v1.txt # ch
|
| 20 |
+
max_text_length: &max_text_length 25
|
| 21 |
+
use_space_char: &use_space_char False
|
| 22 |
+
save_res_path: ./output/rec/ch/predicts_svtr_base_cppd.txt
|
| 23 |
+
use_amp: True
|
| 24 |
+
|
| 25 |
+
Optimizer:
|
| 26 |
+
name: AdamW
|
| 27 |
+
lr: 0.0005 # for 4gpus bs128/gpu
|
| 28 |
+
weight_decay: 0.05
|
| 29 |
+
filter_bias_and_bn: True
|
| 30 |
+
|
| 31 |
+
LRScheduler:
|
| 32 |
+
name: CosineAnnealingLR
|
| 33 |
+
warmup_epoch: 5
|
| 34 |
+
|
| 35 |
+
Architecture:
|
| 36 |
+
model_type: rec
|
| 37 |
+
algorithm: CPPD
|
| 38 |
+
in_channels: 3
|
| 39 |
+
Transform:
|
| 40 |
+
Encoder:
|
| 41 |
+
name: SVTRNet
|
| 42 |
+
img_size: [32, 256]
|
| 43 |
+
patch_merging: 'Conv'
|
| 44 |
+
embed_dim: [128, 256, 384]
|
| 45 |
+
depth: [6, 6, 4]
|
| 46 |
+
num_heads: [4, 8, 12]
|
| 47 |
+
mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
|
| 48 |
+
local_mixer: [[5, 5], [5, 5], [5, 5]]
|
| 49 |
+
last_stage: False
|
| 50 |
+
prenorm: True
|
| 51 |
+
Decoder:
|
| 52 |
+
name: CPPDDecoder
|
| 53 |
+
vis_seq: 128
|
| 54 |
+
num_layer: 3
|
| 55 |
+
pos_len: False
|
| 56 |
+
rec_layer: 1
|
| 57 |
+
ch: True
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
Loss:
|
| 61 |
+
name: CPPDLoss
|
| 62 |
+
ignore_index: 7000
|
| 63 |
+
smoothing: True
|
| 64 |
+
pos_len: False
|
| 65 |
+
sideloss_weight: 1.0
|
| 66 |
+
|
| 67 |
+
PostProcess:
|
| 68 |
+
name: CPPDLabelDecode
|
| 69 |
+
character_dict_path: *character_dict_path
|
| 70 |
+
use_space_char: *use_space_char
|
| 71 |
+
|
| 72 |
+
Metric:
|
| 73 |
+
name: RecMetric
|
| 74 |
+
main_indicator: acc
|
| 75 |
+
|
| 76 |
+
Train:
|
| 77 |
+
dataset:
|
| 78 |
+
name: LMDBDataSet
|
| 79 |
+
data_dir: ../benchmark_bctr/benchmark_bctr_train
|
| 80 |
+
transforms:
|
| 81 |
+
- DecodeImage: # load image
|
| 82 |
+
img_mode: BGR
|
| 83 |
+
channel_first: False
|
| 84 |
+
- CPPDLabelEncode: # Class handling label
|
| 85 |
+
pos_len: False
|
| 86 |
+
ch: True
|
| 87 |
+
ignore_index: 7000
|
| 88 |
+
character_dict_path: *character_dict_path
|
| 89 |
+
use_space_char: *use_space_char
|
| 90 |
+
max_text_length: *max_text_length
|
| 91 |
+
- SVTRResize:
|
| 92 |
+
image_shape: [3, 32, 256]
|
| 93 |
+
padding: True
|
| 94 |
+
- KeepKeys:
|
| 95 |
+
keep_keys: ['image', 'label', 'label_node', 'label_index', 'length'] # dataloader will return list in this order
|
| 96 |
+
loader:
|
| 97 |
+
shuffle: True
|
| 98 |
+
batch_size_per_card: 128
|
| 99 |
+
drop_last: True
|
| 100 |
+
num_workers: 8
|
| 101 |
+
|
| 102 |
+
Eval:
|
| 103 |
+
dataset:
|
| 104 |
+
name: LMDBDataSet
|
| 105 |
+
data_dir: ../benchmark_bctr/benchmark_bctr_test/scene_test
|
| 106 |
+
transforms:
|
| 107 |
+
- DecodeImage: # load image
|
| 108 |
+
img_mode: BGR
|
| 109 |
+
channel_first: False
|
| 110 |
+
- CPPDLabelEncode: # Class handling label
|
| 111 |
+
pos_len: False
|
| 112 |
+
ch: True
|
| 113 |
+
ignore_index: 7000
|
| 114 |
+
character_dict_path: *character_dict_path
|
| 115 |
+
use_space_char: *use_space_char
|
| 116 |
+
max_text_length: *max_text_length
|
| 117 |
+
- SVTRResize:
|
| 118 |
+
image_shape: [3, 32, 256]
|
| 119 |
+
padding: True
|
| 120 |
+
- KeepKeys:
|
| 121 |
+
keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
|
| 122 |
+
loader:
|
| 123 |
+
shuffle: False
|
| 124 |
+
drop_last: False
|
| 125 |
+
batch_size_per_card: 256
|
| 126 |
+
num_workers: 4
|
configs/rec/cppd/svtr_base_cppd_h8.yml
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtr_base_h8_cppd/
|
| 7 |
+
save_epoch_step: [15, 1]
|
| 8 |
+
# evaluation is run every 500 iterations (see eval_batch_step)
|
| 9 |
+
eval_batch_step: [0, 500]
|
| 10 |
+
eval_epoch_step: [0, 1]
|
| 11 |
+
cal_metric_during_train: True
|
| 12 |
+
pretrained_model:
|
| 13 |
+
checkpoints:
|
| 14 |
+
use_tensorboard: false
|
| 15 |
+
infer_img:
|
| 16 |
+
# for data or label process
|
| 17 |
+
character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
|
| 18 |
+
# ./tools/utils/ppocr_keys_v1.txt # ch
|
| 19 |
+
max_text_length: &max_text_length 25
|
| 20 |
+
use_space_char: &use_space_char False
|
| 21 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_cppd.txt
|
| 22 |
+
use_amp: True
|
| 23 |
+
|
| 24 |
+
Optimizer:
|
| 25 |
+
name: AdamW
|
| 26 |
+
lr: 0.00065 # for 4gpus bs256/gpu
|
| 27 |
+
weight_decay: 0.05
|
| 28 |
+
filter_bias_and_bn: True
|
| 29 |
+
|
| 30 |
+
LRScheduler:
|
| 31 |
+
name: OneCycleLR
|
| 32 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 33 |
+
cycle_momentum: False
|
| 34 |
+
|
| 35 |
+
Architecture:
|
| 36 |
+
model_type: rec
|
| 37 |
+
algorithm: CPPD
|
| 38 |
+
in_channels: 3
|
| 39 |
+
Transform:
|
| 40 |
+
Encoder:
|
| 41 |
+
name: SVTRNet
|
| 42 |
+
img_size: [32, 128]
|
| 43 |
+
out_char_num: 25
|
| 44 |
+
out_channels: 256
|
| 45 |
+
patch_merging: 'Conv'
|
| 46 |
+
embed_dim: [128, 256, 384]
|
| 47 |
+
depth: [6, 6, 6]
|
| 48 |
+
num_heads: [4, 8, 12]
|
| 49 |
+
sub_k: [[1, 1], [2, 1]]
|
| 50 |
+
mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
|
| 51 |
+
local_mixer: [[5, 5], [5, 5], [5, 5]]
|
| 52 |
+
last_stage: False
|
| 53 |
+
prenorm: True
|
| 54 |
+
Decoder:
|
| 55 |
+
name: CPPDDecoder
|
| 56 |
+
vis_seq: 128
|
| 57 |
+
num_layer: 2
|
| 58 |
+
pos_len: False
|
| 59 |
+
rec_layer: 1
|
| 60 |
+
|
| 61 |
+
Loss:
|
| 62 |
+
name: CPPDLoss
|
| 63 |
+
ignore_index: 100
|
| 64 |
+
smoothing: True
|
| 65 |
+
pos_len: False
|
| 66 |
+
sideloss_weight: 1.0
|
| 67 |
+
|
| 68 |
+
PostProcess:
|
| 69 |
+
name: CPPDLabelDecode
|
| 70 |
+
character_dict_path: *character_dict_path
|
| 71 |
+
use_space_char: *use_space_char
|
| 72 |
+
|
| 73 |
+
Metric:
|
| 74 |
+
name: RecMetric
|
| 75 |
+
main_indicator: acc
|
| 76 |
+
is_filter: True
|
| 77 |
+
|
| 78 |
+
Train:
|
| 79 |
+
dataset:
|
| 80 |
+
name: LMDBDataSet
|
| 81 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 82 |
+
transforms:
|
| 83 |
+
- DecodeImagePIL: # load image
|
| 84 |
+
img_mode: RGB
|
| 85 |
+
- PARSeqAugPIL:
|
| 86 |
+
- CPPDLabelEncode: # Class handling label
|
| 87 |
+
pos_len: False
|
| 88 |
+
character_dict_path: *character_dict_path
|
| 89 |
+
use_space_char: *use_space_char
|
| 90 |
+
max_text_length: *max_text_length
|
| 91 |
+
- RecTVResize:
|
| 92 |
+
image_shape: [32, 128]
|
| 93 |
+
padding: False
|
| 94 |
+
- KeepKeys:
|
| 95 |
+
keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
|
| 96 |
+
loader:
|
| 97 |
+
shuffle: True
|
| 98 |
+
batch_size_per_card: 256
|
| 99 |
+
drop_last: True
|
| 100 |
+
num_workers: 4
|
| 101 |
+
|
| 102 |
+
Eval:
|
| 103 |
+
dataset:
|
| 104 |
+
name: LMDBDataSet
|
| 105 |
+
data_dir: ../evaluation/
|
| 106 |
+
transforms:
|
| 107 |
+
- DecodeImagePIL: # load image
|
| 108 |
+
img_mode: RGB
|
| 109 |
+
- CPPDLabelEncode: # Class handling label
|
| 110 |
+
pos_len: False
|
| 111 |
+
character_dict_path: *character_dict_path
|
| 112 |
+
use_space_char: *use_space_char
|
| 113 |
+
max_text_length: *max_text_length
|
| 114 |
+
- RecTVResize:
|
| 115 |
+
image_shape: [32, 128]
|
| 116 |
+
padding: False
|
| 117 |
+
- KeepKeys:
|
| 118 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 119 |
+
loader:
|
| 120 |
+
shuffle: False
|
| 121 |
+
drop_last: False
|
| 122 |
+
batch_size_per_card: 128
|
| 123 |
+
num_workers: 4
|
configs/rec/cppd/svtr_base_cppd_syn.yml
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 60
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/syn/svtr_base_cppd/
|
| 7 |
+
save_epoch_step: [15, 1]
|
| 8 |
+
# evaluation is run every 500 iterations (see eval_batch_step)
|
| 9 |
+
eval_batch_step: [0, 500]
|
| 10 |
+
eval_epoch_step: [0, 1]
|
| 11 |
+
cal_metric_during_train: True
|
| 12 |
+
pretrained_model:
|
| 13 |
+
checkpoints:
|
| 14 |
+
use_tensorboard: false
|
| 15 |
+
infer_img:
|
| 16 |
+
# for data or label process
|
| 17 |
+
character_dict_path: &character_dict_path
|
| 18 |
+
# ./tools/utils/EN_symbol_dict.txt # 96en
|
| 19 |
+
# ./tools/utils/ppocr_keys_v1.txt # ch
|
| 20 |
+
max_text_length: &max_text_length 25
|
| 21 |
+
use_space_char: &use_space_char False
|
| 22 |
+
save_res_path: ./output/rec/syn/predicts_svtr_base_cppd.txt
|
| 23 |
+
use_amp: True
|
| 24 |
+
|
| 25 |
+
Optimizer:
|
| 26 |
+
name: AdamW
|
| 27 |
+
lr: 0.0005 # for 4gpus bs256/gpu
|
| 28 |
+
weight_decay: 0.05
|
| 29 |
+
filter_bias_and_bn: True
|
| 30 |
+
|
| 31 |
+
LRScheduler:
|
| 32 |
+
name: CosineAnnealingLR
|
| 33 |
+
warmup_epoch: 6
|
| 34 |
+
|
| 35 |
+
Architecture:
|
| 36 |
+
model_type: rec
|
| 37 |
+
algorithm: CPPD
|
| 38 |
+
in_channels: 3
|
| 39 |
+
Transform:
|
| 40 |
+
Encoder:
|
| 41 |
+
name: SVTRNet
|
| 42 |
+
img_size: [32, 100]
|
| 43 |
+
out_char_num: 25
|
| 44 |
+
out_channels: 256
|
| 45 |
+
patch_merging: 'Conv'
|
| 46 |
+
embed_dim: [128, 256, 384]
|
| 47 |
+
depth: [6, 6, 4]
|
| 48 |
+
num_heads: [4, 8, 12]
|
| 49 |
+
mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
|
| 50 |
+
local_mixer: [[5, 5], [5, 5], [5, 5]]
|
| 51 |
+
last_stage: False
|
| 52 |
+
prenorm: True
|
| 53 |
+
Decoder:
|
| 54 |
+
name: CPPDDecoder
|
| 55 |
+
vis_seq: 50
|
| 56 |
+
num_layer: 3
|
| 57 |
+
pos_len: False
|
| 58 |
+
rec_layer: 1
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
Loss:
|
| 62 |
+
name: CPPDLoss
|
| 63 |
+
ignore_index: 100
|
| 64 |
+
smoothing: True
|
| 65 |
+
pos_len: False
|
| 66 |
+
sideloss_weight: 1.0
|
| 67 |
+
|
| 68 |
+
PostProcess:
|
| 69 |
+
name: CPPDLabelDecode
|
| 70 |
+
character_dict_path: *character_dict_path
|
| 71 |
+
use_space_char: *use_space_char
|
| 72 |
+
|
| 73 |
+
Metric:
|
| 74 |
+
name: RecMetric
|
| 75 |
+
main_indicator: acc
|
| 76 |
+
|
| 77 |
+
Train:
|
| 78 |
+
dataset:
|
| 79 |
+
name: STRLMDBDataSet
|
| 80 |
+
data_dir: ./
|
| 81 |
+
transforms:
|
| 82 |
+
- DecodeImage: # load image
|
| 83 |
+
img_mode: BGR
|
| 84 |
+
channel_first: False
|
| 85 |
+
# - SVTRRAug:
|
| 86 |
+
- CPPDLabelEncode: # Class handling label
|
| 87 |
+
pos_len: False
|
| 88 |
+
character_dict_path: *character_dict_path
|
| 89 |
+
use_space_char: *use_space_char
|
| 90 |
+
max_text_length: *max_text_length
|
| 91 |
+
- SVTRResize:
|
| 92 |
+
image_shape: [3, 32, 100]
|
| 93 |
+
padding: False
|
| 94 |
+
- KeepKeys:
|
| 95 |
+
keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
|
| 96 |
+
loader:
|
| 97 |
+
shuffle: True
|
| 98 |
+
batch_size_per_card: 256
|
| 99 |
+
drop_last: True
|
| 100 |
+
num_workers: 8
|
| 101 |
+
|
| 102 |
+
Eval:
|
| 103 |
+
dataset:
|
| 104 |
+
name: LMDBDataSet
|
| 105 |
+
data_dir: ../evaluation/
|
| 106 |
+
transforms:
|
| 107 |
+
- DecodeImage: # load image
|
| 108 |
+
img_mode: BGR
|
| 109 |
+
channel_first: False
|
| 110 |
+
- CPPDLabelEncode: # Class handling label
|
| 111 |
+
pos_len: False
|
| 112 |
+
character_dict_path: *character_dict_path
|
| 113 |
+
use_space_char: *use_space_char
|
| 114 |
+
max_text_length: *max_text_length
|
| 115 |
+
- SVTRResize:
|
| 116 |
+
image_shape: [3, 32, 100]
|
| 117 |
+
padding: False
|
| 118 |
+
- KeepKeys:
|
| 119 |
+
keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
|
| 120 |
+
loader:
|
| 121 |
+
shuffle: False
|
| 122 |
+
drop_last: False
|
| 123 |
+
batch_size_per_card: 256
|
| 124 |
+
num_workers: 4
|
configs/rec/cppd/svtrv2_cppd.yml
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtrv2_cppd/
|
| 7 |
+
save_epoch_step: [15, 1]
|
| 8 |
+
# evaluation is run every 500 iterations (eval_batch_step: [start_iter, interval])
|
| 9 |
+
eval_batch_step: [0, 500]
|
| 10 |
+
eval_epoch_step: [0, 1]
|
| 11 |
+
cal_metric_during_train: True
|
| 12 |
+
pretrained_model:
|
| 13 |
+
checkpoints:
|
| 14 |
+
use_tensorboard: false
|
| 15 |
+
infer_img:
|
| 16 |
+
# for data or label process
|
| 17 |
+
character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
|
| 18 |
+
# ./tools/utils/ppocr_keys_v1.txt # ch
|
| 19 |
+
max_text_length: &max_text_length 25
|
| 20 |
+
use_space_char: &use_space_char False
|
| 21 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cppd.txt
|
| 22 |
+
use_amp: True
|
| 23 |
+
|
| 24 |
+
Optimizer:
|
| 25 |
+
name: AdamW
|
| 26 |
+
lr: 0.00065 # for 4gpus bs256/gpu
|
| 27 |
+
weight_decay: 0.05
|
| 28 |
+
filter_bias_and_bn: True
|
| 29 |
+
|
| 30 |
+
LRScheduler:
|
| 31 |
+
name: OneCycleLR
|
| 32 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 33 |
+
cycle_momentum: False
|
| 34 |
+
|
| 35 |
+
Architecture:
|
| 36 |
+
model_type: rec
|
| 37 |
+
algorithm: CPPD
|
| 38 |
+
in_channels: 3
|
| 39 |
+
Transform:
|
| 40 |
+
Encoder:
|
| 41 |
+
name: SVTRv2LNConvTwo33
|
| 42 |
+
use_pos_embed: False
|
| 43 |
+
out_channels: 256
|
| 44 |
+
dims: [128, 256, 384]
|
| 45 |
+
depths: [6, 6, 6]
|
| 46 |
+
num_heads: [4, 8, 12]
|
| 47 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 48 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 49 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 50 |
+
last_stage: false
|
| 51 |
+
feat2d: False
|
| 52 |
+
Decoder:
|
| 53 |
+
name: CPPDDecoder
|
| 54 |
+
ds: True
|
| 55 |
+
num_layer: 2
|
| 56 |
+
pos_len: False
|
| 57 |
+
rec_layer: 1
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
Loss:
|
| 61 |
+
name: CPPDLoss
|
| 62 |
+
ignore_index: 100
|
| 63 |
+
smoothing: True
|
| 64 |
+
pos_len: False
|
| 65 |
+
sideloss_weight: 1.0
|
| 66 |
+
|
| 67 |
+
PostProcess:
|
| 68 |
+
name: CPPDLabelDecode
|
| 69 |
+
character_dict_path: *character_dict_path
|
| 70 |
+
use_space_char: *use_space_char
|
| 71 |
+
|
| 72 |
+
Metric:
|
| 73 |
+
name: RecMetric
|
| 74 |
+
main_indicator: acc
|
| 75 |
+
is_filter: True
|
| 76 |
+
|
| 77 |
+
Train:
|
| 78 |
+
dataset:
|
| 79 |
+
name: RatioDataSetTVResize
|
| 80 |
+
ds_width: True
|
| 81 |
+
padding: false
|
| 82 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
|
| 83 |
+
'../Union14M-L-LMDB-Filtered/filter_train_hard',
|
| 84 |
+
'../Union14M-L-LMDB-Filtered/filter_train_medium',
|
| 85 |
+
'../Union14M-L-LMDB-Filtered/filter_train_normal',
|
| 86 |
+
'../Union14M-L-LMDB-Filtered/filter_train_easy',
|
| 87 |
+
]
|
| 88 |
+
transforms:
|
| 89 |
+
- DecodeImagePIL: # load image
|
| 90 |
+
img_mode: RGB
|
| 91 |
+
- PARSeqAugPIL:
|
| 92 |
+
- CPPDLabelEncode: # Class handling label
|
| 93 |
+
pos_len: False
|
| 94 |
+
character_dict_path: *character_dict_path
|
| 95 |
+
use_space_char: *use_space_char
|
| 96 |
+
max_text_length: *max_text_length
|
| 97 |
+
- KeepKeys:
|
| 98 |
+
keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
|
| 99 |
+
sampler:
|
| 100 |
+
name: RatioSampler
|
| 101 |
+
scales: [[128, 32]] # w, h
|
| 102 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 103 |
+
first_bs: &bs 256
|
| 104 |
+
fix_bs: false
|
| 105 |
+
divided_factor: [4, 16] # w, h
|
| 106 |
+
is_training: True
|
| 107 |
+
loader:
|
| 108 |
+
shuffle: True
|
| 109 |
+
batch_size_per_card: *bs
|
| 110 |
+
drop_last: True
|
| 111 |
+
max_ratio: &max_ratio 4
|
| 112 |
+
num_workers: 4
|
| 113 |
+
|
| 114 |
+
Eval:
|
| 115 |
+
dataset:
|
| 116 |
+
name: RatioDataSetTVResize
|
| 117 |
+
ds_width: True
|
| 118 |
+
padding: False
|
| 119 |
+
data_dir_list: [
|
| 120 |
+
'../evaluation/CUTE80',
|
| 121 |
+
'../evaluation/IC13_857',
|
| 122 |
+
'../evaluation/IC15_1811',
|
| 123 |
+
'../evaluation/IIIT5k',
|
| 124 |
+
'../evaluation/SVT',
|
| 125 |
+
'../evaluation/SVTP',
|
| 126 |
+
]
|
| 127 |
+
transforms:
|
| 128 |
+
- DecodeImagePIL: # load image
|
| 129 |
+
img_mode: RGB
|
| 130 |
+
- CPPDLabelEncode: # Class handling label
|
| 131 |
+
pos_len: False
|
| 132 |
+
character_dict_path: *character_dict_path
|
| 133 |
+
use_space_char: *use_space_char
|
| 134 |
+
max_text_length: *max_text_length
|
| 135 |
+
- KeepKeys:
|
| 136 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 137 |
+
sampler:
|
| 138 |
+
name: RatioSampler
|
| 139 |
+
scales: [[128, 32]] # w, h
|
| 140 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 141 |
+
first_bs: *bs
|
| 142 |
+
fix_bs: false
|
| 143 |
+
divided_factor: [4, 16] # w, h
|
| 144 |
+
is_training: False
|
| 145 |
+
loader:
|
| 146 |
+
shuffle: False
|
| 147 |
+
drop_last: False
|
| 148 |
+
batch_size_per_card: *bs
|
| 149 |
+
max_ratio: *max_ratio
|
| 150 |
+
num_workers: 4
|
configs/rec/dan/resnet45_fpn_dan.yml
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/resnet45_fpn_dan/
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: 25
|
| 17 |
+
use_space_char: False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_resnet45_fpn_dan.txt
|
| 19 |
+
use_amp: True
|
| 20 |
+
grad_clip_val: 20
|
| 21 |
+
|
| 22 |
+
Optimizer:
|
| 23 |
+
name: Adam
|
| 24 |
+
lr: 0.00065 # for 4gpus bs256/gpu
|
| 25 |
+
weight_decay: 0.0
|
| 26 |
+
filter_bias_and_bn: False
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: OneCycleLR
|
| 30 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 31 |
+
cycle_momentum: False
|
| 32 |
+
|
| 33 |
+
Architecture:
|
| 34 |
+
model_type: rec
|
| 35 |
+
algorithm: DAN
|
| 36 |
+
Transform:
|
| 37 |
+
Encoder:
|
| 38 |
+
name: ResNet45
|
| 39 |
+
in_channels: 3
|
| 40 |
+
strides: [2, 1, 2, 1, 1]
|
| 41 |
+
return_list: True
|
| 42 |
+
Decoder:
|
| 43 |
+
name: DANDecoder
|
| 44 |
+
max_len: 25
|
| 45 |
+
channels_list: [64, 128, 256, 512]
|
| 46 |
+
strides_list: [[2, 2], [1, 1], [1, 1]]
|
| 47 |
+
in_shape: [8, 32]
|
| 48 |
+
depth: 4
|
| 49 |
+
|
| 50 |
+
Loss:
|
| 51 |
+
name: ARLoss
|
| 52 |
+
|
| 53 |
+
PostProcess:
|
| 54 |
+
name: ARLabelDecode
|
| 55 |
+
|
| 56 |
+
Metric:
|
| 57 |
+
name: RecMetric
|
| 58 |
+
main_indicator: acc
|
| 59 |
+
is_filter: True
|
| 60 |
+
|
| 61 |
+
Train:
|
| 62 |
+
dataset:
|
| 63 |
+
name: LMDBDataSet
|
| 64 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 65 |
+
transforms:
|
| 66 |
+
- DecodeImagePIL: # load image
|
| 67 |
+
img_mode: RGB
|
| 68 |
+
- PARSeqAugPIL:
|
| 69 |
+
- ARLabelEncode:
|
| 70 |
+
- RecTVResize:
|
| 71 |
+
image_shape: [32, 128]
|
| 72 |
+
padding: False
|
| 73 |
+
- KeepKeys:
|
| 74 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 75 |
+
loader:
|
| 76 |
+
shuffle: True
|
| 77 |
+
batch_size_per_card: 256
|
| 78 |
+
drop_last: True
|
| 79 |
+
num_workers: 4
|
| 80 |
+
|
| 81 |
+
Eval:
|
| 82 |
+
dataset:
|
| 83 |
+
name: LMDBDataSet
|
| 84 |
+
data_dir: ../evaluation
|
| 85 |
+
transforms:
|
| 86 |
+
- DecodeImagePIL: # load image
|
| 87 |
+
img_mode: RGB
|
| 88 |
+
- ARLabelEncode:
|
| 89 |
+
- RecTVResize:
|
| 90 |
+
image_shape: [32, 128]
|
| 91 |
+
padding: False
|
| 92 |
+
- KeepKeys:
|
| 93 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 94 |
+
loader:
|
| 95 |
+
shuffle: False
|
| 96 |
+
drop_last: False
|
| 97 |
+
batch_size_per_card: 256
|
| 98 |
+
num_workers: 2
|
configs/rec/dan/svtrv2_dan.yml
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtrv2_dan
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: 25
|
| 17 |
+
use_space_char: False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_dan.txt
|
| 19 |
+
use_amp: True
|
| 20 |
+
grad_clip_val: 20
|
| 21 |
+
|
| 22 |
+
Optimizer:
|
| 23 |
+
name: AdamW
|
| 24 |
+
lr: 0.00065 # 4gpus 256bs/gpu
|
| 25 |
+
weight_decay: 0.05
|
| 26 |
+
filter_bias_and_bn: True
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: OneCycleLR
|
| 30 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 31 |
+
cycle_momentum: False
|
| 32 |
+
|
| 33 |
+
Architecture:
|
| 34 |
+
model_type: rec
|
| 35 |
+
algorithm: DAN
|
| 36 |
+
Transform:
|
| 37 |
+
Encoder:
|
| 38 |
+
name: SVTRv2LNConvTwo33
|
| 39 |
+
use_pos_embed: False
|
| 40 |
+
out_channels: 256
|
| 41 |
+
dims: [128, 256, 384]
|
| 42 |
+
depths: [6, 6, 6]
|
| 43 |
+
num_heads: [4, 8, 12]
|
| 44 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 45 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 46 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 47 |
+
last_stage: false
|
| 48 |
+
feat2d: True
|
| 49 |
+
Decoder:
|
| 50 |
+
name: DANDecoder
|
| 51 |
+
use_cam: False
|
| 52 |
+
max_len: 25
|
| 53 |
+
|
| 54 |
+
Loss:
|
| 55 |
+
name: ARLoss
|
| 56 |
+
|
| 57 |
+
PostProcess:
|
| 58 |
+
name: ARLabelDecode
|
| 59 |
+
|
| 60 |
+
Metric:
|
| 61 |
+
name: RecMetric
|
| 62 |
+
main_indicator: acc
|
| 63 |
+
is_filter: True
|
| 64 |
+
|
| 65 |
+
Train:
|
| 66 |
+
dataset:
|
| 67 |
+
name: RatioDataSetTVResize
|
| 68 |
+
ds_width: True
|
| 69 |
+
padding: false
|
| 70 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_filter_train_challenging',
|
| 71 |
+
'../Union14M-L-LMDB-Filtered/filter_filter_train_hard',
|
| 72 |
+
'../Union14M-L-LMDB-Filtered/filter_filter_train_medium',
|
| 73 |
+
'../Union14M-L-LMDB-Filtered/filter_filter_train_normal',
|
| 74 |
+
'../Union14M-L-LMDB-Filtered/filter_filter_train_easy',
|
| 75 |
+
]
|
| 76 |
+
transforms:
|
| 77 |
+
- DecodeImagePIL: # load image
|
| 78 |
+
img_mode: RGB
|
| 79 |
+
- PARSeqAugPIL:
|
| 80 |
+
- ARLabelEncode:
|
| 81 |
+
- KeepKeys:
|
| 82 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 83 |
+
sampler:
|
| 84 |
+
name: RatioSampler
|
| 85 |
+
scales: [[128, 32]] # w, h
|
| 86 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 87 |
+
first_bs: &bs 256
|
| 88 |
+
fix_bs: false
|
| 89 |
+
divided_factor: [4, 16] # w, h
|
| 90 |
+
is_training: True
|
| 91 |
+
loader:
|
| 92 |
+
shuffle: True
|
| 93 |
+
batch_size_per_card: *bs
|
| 94 |
+
drop_last: True
|
| 95 |
+
max_ratio: &max_ratio 4
|
| 96 |
+
num_workers: 4
|
| 97 |
+
|
| 98 |
+
Eval:
|
| 99 |
+
dataset:
|
| 100 |
+
name: RatioDataSetTVResize
|
| 101 |
+
ds_width: True
|
| 102 |
+
padding: False
|
| 103 |
+
data_dir_list: [
|
| 104 |
+
'../evaluation/CUTE80',
|
| 105 |
+
'../evaluation/IC13_857',
|
| 106 |
+
'../evaluation/IC15_1811',
|
| 107 |
+
'../evaluation/IIIT5k',
|
| 108 |
+
'../evaluation/SVT',
|
| 109 |
+
'../evaluation/SVTP',
|
| 110 |
+
]
|
| 111 |
+
transforms:
|
| 112 |
+
- DecodeImagePIL: # load image
|
| 113 |
+
img_mode: RGB
|
| 114 |
+
- ARLabelEncode:
|
| 115 |
+
- KeepKeys:
|
| 116 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 117 |
+
sampler:
|
| 118 |
+
name: RatioSampler
|
| 119 |
+
scales: [[128, 32]] # w, h
|
| 120 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 121 |
+
first_bs: *bs
|
| 122 |
+
fix_bs: false
|
| 123 |
+
divided_factor: [4, 16] # w, h
|
| 124 |
+
is_training: False
|
| 125 |
+
loader:
|
| 126 |
+
shuffle: False
|
| 127 |
+
drop_last: False
|
| 128 |
+
batch_size_per_card: *bs
|
| 129 |
+
max_ratio: *max_ratio
|
| 130 |
+
num_workers: 4
|
configs/rec/dptr/dptr_parseq_pretrain.yml
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: /share/ckpt/zhaoshuai/openocr/dptr_parseq/
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: &max_text_length 25
|
| 17 |
+
use_space_char: &use_space_char False
|
| 18 |
+
use_amp: True
|
| 19 |
+
save_res_path: /share/ckpt/zhaoshuai/openocr/dptr_parseq/predicts_dptr_parseq.txt
|
| 20 |
+
grad_clip_val: 20
|
| 21 |
+
|
| 22 |
+
Optimizer:
|
| 23 |
+
name: AdamW
|
| 24 |
+
lr: 0.001485 # 2gpus 384bs/gpu
|
| 25 |
+
weight_decay: 0.
|
| 26 |
+
filter_bias_and_bn: False
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: OneCycleLR
|
| 30 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 31 |
+
cycle_momentum: False
|
| 32 |
+
|
| 33 |
+
Architecture:
|
| 34 |
+
model_type: rec
|
| 35 |
+
algorithm: DPTR
|
| 36 |
+
Decoder:
|
| 37 |
+
name: DptrParseq
|
| 38 |
+
decode_ar: True
|
| 39 |
+
refine_iters: 1
|
| 40 |
+
is_pretrain: True
|
| 41 |
+
ORP_path: /share/ckpt/zhaoshuai/parseq/clip_background.pth
|
| 42 |
+
|
| 43 |
+
Loss:
|
| 44 |
+
name: PARSeqLoss
|
| 45 |
+
|
| 46 |
+
PostProcess:
|
| 47 |
+
name: ARLabelDecode
|
| 48 |
+
character_dict_path: *character_dict_path
|
| 49 |
+
use_space_char: *use_space_char
|
| 50 |
+
|
| 51 |
+
Metric:
|
| 52 |
+
name: RecMetric
|
| 53 |
+
main_indicator: acc
|
| 54 |
+
is_filter: True
|
| 55 |
+
|
| 56 |
+
Train:
|
| 57 |
+
dataset:
|
| 58 |
+
name: TextLMDBDataSet
|
| 59 |
+
data_dir: /share/test/zhaoshuai/parseq-data/data/train/real/ArT
|
| 60 |
+
transforms:
|
| 61 |
+
- DPTRLabelEncode: # Class handling label
|
| 62 |
+
character_dict_path: *character_dict_path
|
| 63 |
+
use_space_char: *use_space_char
|
| 64 |
+
max_text_length: *max_text_length
|
| 65 |
+
- KeepKeys:
|
| 66 |
+
keep_keys: ['clip_label', 'label'] # dataloader will return list in this order
|
| 67 |
+
loader:
|
| 68 |
+
shuffle: True
|
| 69 |
+
batch_size_per_card: 256
|
| 70 |
+
drop_last: True
|
| 71 |
+
num_workers: 4
|
| 72 |
+
|
| 73 |
+
Eval:
|
| 74 |
+
dataset:
|
| 75 |
+
name: TextLMDBDataSet
|
| 76 |
+
data_dir: /share/test/zhaoshuai/parseq-data/data/val
|
| 77 |
+
transforms:
|
| 78 |
+
- DPTRLabelEncode: # Class handling label
|
| 79 |
+
character_dict_path: *character_dict_path
|
| 80 |
+
use_space_char: *use_space_char
|
| 81 |
+
max_text_length: *max_text_length
|
| 82 |
+
- KeepKeys:
|
| 83 |
+
keep_keys: ['clip_label', 'label'] # dataloader will return list in this order
|
| 84 |
+
loader:
|
| 85 |
+
shuffle: False
|
| 86 |
+
drop_last: False
|
| 87 |
+
batch_size_per_card: 256
|
| 88 |
+
num_workers: 2
|
configs/rec/focalsvtr/focalsvtr_ctc.yml
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/focalsvtr_ctc/
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: &character_dict_path
|
| 16 |
+
# ./tools/utils/EN_symbol_dict.txt
|
| 17 |
+
max_text_length: &max_text_length 25
|
| 18 |
+
use_space_char: &use_space_char False
|
| 19 |
+
save_res_path: ./output/rec/u14m_filter/predicts_focalsvtr_ctc.txt
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
Optimizer:
|
| 23 |
+
name: AdamW
|
| 24 |
+
lr: 0.00065 # for 4gpus bs256/gpu
|
| 25 |
+
weight_decay: 0.05
|
| 26 |
+
filter_bias_and_bn: True
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: OneCycleLR
|
| 30 |
+
|
| 31 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 32 |
+
cycle_momentum: False
|
| 33 |
+
|
| 34 |
+
Architecture:
|
| 35 |
+
model_type: rec
|
| 36 |
+
algorithm: SVTR
|
| 37 |
+
Transform:
|
| 38 |
+
Encoder:
|
| 39 |
+
name: FocalSVTR
|
| 40 |
+
img_size: [32, 128]
|
| 41 |
+
depths: [6, 6, 6]
|
| 42 |
+
embed_dim: 96
|
| 43 |
+
sub_k: [[1, 1], [2, 1], [1, 1]]
|
| 44 |
+
focal_levels: [3, 3, 3]
|
| 45 |
+
out_channels: 256
|
| 46 |
+
last_stage: True
|
| 47 |
+
Decoder:
|
| 48 |
+
name: CTCDecoder
|
| 49 |
+
|
| 50 |
+
Loss:
|
| 51 |
+
name: CTCLoss
|
| 52 |
+
zero_infinity: True
|
| 53 |
+
|
| 54 |
+
PostProcess:
|
| 55 |
+
name: CTCLabelDecode
|
| 56 |
+
character_dict_path: *character_dict_path
|
| 57 |
+
use_space_char: *use_space_char
|
| 58 |
+
|
| 59 |
+
Metric:
|
| 60 |
+
name: RecMetric
|
| 61 |
+
main_indicator: acc
|
| 62 |
+
is_filter: True
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
Train:
|
| 66 |
+
dataset:
|
| 67 |
+
name: RatioDataSet
|
| 68 |
+
ds_width: True
|
| 69 |
+
padding: &padding False
|
| 70 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
|
| 71 |
+
'../Union14M-L-LMDB-Filtered/filter_train_hard',
|
| 72 |
+
'../Union14M-L-LMDB-Filtered/filter_train_medium',
|
| 73 |
+
'../Union14M-L-LMDB-Filtered/filter_train_normal',
|
| 74 |
+
'../Union14M-L-LMDB-Filtered/filter_train_easy',
|
| 75 |
+
]
|
| 76 |
+
transforms:
|
| 77 |
+
- DecodeImage: # load image
|
| 78 |
+
img_mode: BGR
|
| 79 |
+
channel_first: False
|
| 80 |
+
- PARSeqAug:
|
| 81 |
+
- CTCLabelEncode: # Class handling label
|
| 82 |
+
character_dict_path: *character_dict_path
|
| 83 |
+
use_space_char: *use_space_char
|
| 84 |
+
max_text_length: *max_text_length
|
| 85 |
+
- KeepKeys:
|
| 86 |
+
keep_keys: ['image', 'label', 'length']
|
| 87 |
+
sampler:
|
| 88 |
+
name: RatioSampler
|
| 89 |
+
scales: [[128, 32]] # w, h
|
| 90 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 91 |
+
first_bs: &bs 256
|
| 92 |
+
fix_bs: false
|
| 93 |
+
divided_factor: [4, 16] # w, h
|
| 94 |
+
is_training: True
|
| 95 |
+
loader:
|
| 96 |
+
shuffle: True
|
| 97 |
+
batch_size_per_card: *bs
|
| 98 |
+
drop_last: True
|
| 99 |
+
max_ratio: 12
|
| 100 |
+
num_workers: 4
|
| 101 |
+
|
| 102 |
+
Eval:
|
| 103 |
+
dataset:
|
| 104 |
+
name: RatioDataSet
|
| 105 |
+
ds_width: True
|
| 106 |
+
padding: True
|
| 107 |
+
data_dir_list: ['../evaluation/CUTE80',
|
| 108 |
+
'../evaluation/IC13_857',
|
| 109 |
+
'../evaluation/IC15_1811',
|
| 110 |
+
'../evaluation/IIIT5k',
|
| 111 |
+
'../evaluation/SVT',
|
| 112 |
+
'../evaluation/SVTP',
|
| 113 |
+
]
|
| 114 |
+
transforms:
|
| 115 |
+
- DecodeImage: # load image
|
| 116 |
+
img_mode: BGR
|
| 117 |
+
channel_first: False
|
| 118 |
+
- CTCLabelEncode: # Class handling label
|
| 119 |
+
character_dict_path: *character_dict_path
|
| 120 |
+
use_space_char: *use_space_char
|
| 121 |
+
max_text_length: *max_text_length
|
| 122 |
+
- KeepKeys:
|
| 123 |
+
keep_keys: ['image', 'label', 'length']
|
| 124 |
+
sampler:
|
| 125 |
+
name: RatioSampler
|
| 126 |
+
scales: [[128, 32]] # w, h
|
| 127 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 128 |
+
first_bs: 128
|
| 129 |
+
fix_bs: false
|
| 130 |
+
divided_factor: [4, 16] # w, h
|
| 131 |
+
is_training: False
|
| 132 |
+
loader:
|
| 133 |
+
shuffle: False
|
| 134 |
+
drop_last: False
|
| 135 |
+
batch_size_per_card: 128
|
| 136 |
+
max_ratio: 12
|
| 137 |
+
num_workers: 4
|
configs/rec/gtc/svtrv2_lnconv_nrtr_gtc.yml
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/svtrv2_lnconv_nrtr_gtc
|
| 7 |
+
save_epoch_step: [15, 1]
|
| 8 |
+
# evaluation is run every 500 iterations (eval_batch_step: [start_iter, interval])
|
| 9 |
+
eval_batch_step: [0, 500]
|
| 10 |
+
eval_epoch_step: [0, 1]
|
| 11 |
+
cal_metric_during_train: True
|
| 12 |
+
pretrained_model:
|
| 13 |
+
checkpoints:
|
| 14 |
+
use_tensorboard: false
|
| 15 |
+
infer_img: ../ltb/img
|
| 16 |
+
# for data or label process
|
| 17 |
+
character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
|
| 18 |
+
# ./tools/utils/ppocr_keys_v1.txt # ch
|
| 19 |
+
max_text_length: &max_text_length 25
|
| 20 |
+
use_space_char: &use_space_char False
|
| 21 |
+
save_res_path: ./output/rec/predicts_smtr.txt
|
| 22 |
+
use_amp: True
|
| 23 |
+
distributed: true
|
| 24 |
+
|
| 25 |
+
Optimizer:
|
| 26 |
+
name: AdamW
|
| 27 |
+
lr: 0.00065
|
| 28 |
+
weight_decay: 0.05
|
| 29 |
+
filter_bias_and_bn: True
|
| 30 |
+
|
| 31 |
+
LRScheduler:
|
| 32 |
+
name: OneCycleLR
|
| 33 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 34 |
+
cycle_momentum: False
|
| 35 |
+
|
| 36 |
+
Architecture:
|
| 37 |
+
model_type: rec
|
| 38 |
+
algorithm: BGPD
|
| 39 |
+
in_channels: 3
|
| 40 |
+
Transform:
|
| 41 |
+
Encoder:
|
| 42 |
+
name: SVTRv2LNConvTwo33
|
| 43 |
+
use_pos_embed: False
|
| 44 |
+
out_channels: 256
|
| 45 |
+
dims: [128, 256, 384]
|
| 46 |
+
depths: [6, 6, 6]
|
| 47 |
+
num_heads: [4, 8, 12]
|
| 48 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 49 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 50 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 51 |
+
last_stage: false
|
| 52 |
+
feat2d: True
|
| 53 |
+
Decoder:
|
| 54 |
+
name: GTCDecoder
|
| 55 |
+
infer_gtc: True
|
| 56 |
+
detach: False
|
| 57 |
+
gtc_decoder:
|
| 58 |
+
name: NRTRDecoder
|
| 59 |
+
num_encoder_layers: -1
|
| 60 |
+
beam_size: 0
|
| 61 |
+
num_decoder_layers: 2
|
| 62 |
+
nhead: 12
|
| 63 |
+
max_len: *max_text_length
|
| 64 |
+
ctc_decoder:
|
| 65 |
+
name: RCTCDecoder
|
| 66 |
+
|
| 67 |
+
Loss:
|
| 68 |
+
name: GTCLoss
|
| 69 |
+
gtc_loss:
|
| 70 |
+
name: ARLoss
|
| 71 |
+
|
| 72 |
+
PostProcess:
|
| 73 |
+
name: GTCLabelDecode
|
| 74 |
+
gtc_label_decode:
|
| 75 |
+
name: ARLabelDecode
|
| 76 |
+
character_dict_path: *character_dict_path
|
| 77 |
+
use_space_char: *use_space_char
|
| 78 |
+
|
| 79 |
+
Metric:
|
| 80 |
+
name: RecGTCMetric
|
| 81 |
+
main_indicator: acc
|
| 82 |
+
is_filter: True
|
| 83 |
+
|
| 84 |
+
Train:
|
| 85 |
+
dataset:
|
| 86 |
+
name: RatioDataSet
|
| 87 |
+
ds_width: True
|
| 88 |
+
# max_ratio: &max_ratio 4
|
| 89 |
+
# min_ratio: 1
|
| 90 |
+
# base_shape: &base_shape [[64, 64], [96, 48], [112, 40], [128, 32]]
|
| 91 |
+
# base_h: &base_h 32
|
| 92 |
+
# padding: &padding False
|
| 93 |
+
padding: false
|
| 94 |
+
# padding_rand: true
|
| 95 |
+
# padding_doub: true
|
| 96 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
|
| 97 |
+
'../Union14M-L-LMDB-Filtered/filter_train_hard',
|
| 98 |
+
'../Union14M-L-LMDB-Filtered/filter_train_medium',
|
| 99 |
+
'../Union14M-L-LMDB-Filtered/filter_train_normal',
|
| 100 |
+
'../Union14M-L-LMDB-Filtered/filter_train_easy',
|
| 101 |
+
]
|
| 102 |
+
transforms:
|
| 103 |
+
- DecodeImage: # load image
|
| 104 |
+
img_mode: BGR
|
| 105 |
+
channel_first: False
|
| 106 |
+
- PARSeqAug:
|
| 107 |
+
- GTCLabelEncode: # Class handling label
|
| 108 |
+
gtc_label_encode:
|
| 109 |
+
name: ARLabelEncode
|
| 110 |
+
character_dict_path: *character_dict_path
|
| 111 |
+
use_space_char: *use_space_char
|
| 112 |
+
max_text_length: *max_text_length
|
| 113 |
+
- KeepKeys:
|
| 114 |
+
keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
|
| 115 |
+
sampler:
|
| 116 |
+
name: RatioSampler
|
| 117 |
+
scales: [[128, 32]] # w, h
|
| 118 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 119 |
+
first_bs: &bs 256
|
| 120 |
+
fix_bs: false
|
| 121 |
+
divided_factor: [4, 16] # w, h
|
| 122 |
+
is_training: True
|
| 123 |
+
loader:
|
| 124 |
+
shuffle: True
|
| 125 |
+
batch_size_per_card: *bs
|
| 126 |
+
drop_last: True
|
| 127 |
+
max_ratio: &max_ratio 4
|
| 128 |
+
num_workers: 4
|
| 129 |
+
|
| 130 |
+
Eval:
|
| 131 |
+
dataset:
|
| 132 |
+
name: RatioDataSet
|
| 133 |
+
ds_width: True
|
| 134 |
+
padding: False
|
| 135 |
+
data_dir_list: [
|
| 136 |
+
'../evaluation/CUTE80',
|
| 137 |
+
'../evaluation/IC13_857',
|
| 138 |
+
'../evaluation/IC15_1811',
|
| 139 |
+
'../evaluation/IIIT5k',
|
| 140 |
+
'../evaluation/SVT',
|
| 141 |
+
'../evaluation/SVTP',
|
| 142 |
+
]
|
| 143 |
+
transforms:
|
| 144 |
+
- DecodeImage: # load image
|
| 145 |
+
img_mode: BGR
|
| 146 |
+
channel_first: False
|
| 147 |
+
- GTCLabelEncode: # Class handling label
|
| 148 |
+
gtc_label_encode:
|
| 149 |
+
name: ARLabelEncode
|
| 150 |
+
character_dict_path: *character_dict_path
|
| 151 |
+
use_space_char: *use_space_char
|
| 152 |
+
max_text_length: *max_text_length
|
| 153 |
+
- KeepKeys:
|
| 154 |
+
keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
|
| 155 |
+
sampler:
|
| 156 |
+
name: RatioSampler
|
| 157 |
+
scales: [[128, 32]] # w, h
|
| 158 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 159 |
+
first_bs: *bs
|
| 160 |
+
fix_bs: false
|
| 161 |
+
divided_factor: [4, 16] # w, h
|
| 162 |
+
is_training: False
|
| 163 |
+
loader:
|
| 164 |
+
shuffle: False
|
| 165 |
+
drop_last: False
|
| 166 |
+
batch_size_per_card: *bs
|
| 167 |
+
max_ratio: *max_ratio
|
| 168 |
+
num_workers: 4
|
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_long_infer.yml
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_long_infer
|
| 7 |
+
save_epoch_step: [15, 1]
|
| 8 |
+
# evaluation is run every 1000 iterations
|
| 9 |
+
eval_batch_step: [0, 1000]
|
| 10 |
+
eval_epoch_step: [0, 1]
|
| 11 |
+
cal_metric_during_train: True
|
| 12 |
+
pretrained_model:
|
| 13 |
+
checkpoints:
|
| 14 |
+
use_tensorboard: false
|
| 15 |
+
infer_img: ../ltb/img
|
| 16 |
+
# for data or label process
|
| 17 |
+
character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
|
| 18 |
+
# ./tools/utils/ppocr_keys_v1.txt # ch
|
| 19 |
+
max_text_length: &max_text_length 25
|
| 20 |
+
use_space_char: &use_space_char False
|
| 21 |
+
save_res_path: ./output/rec/predicts_smtr.txt
|
| 22 |
+
use_amp: True
|
| 23 |
+
distributed: true
|
| 24 |
+
|
| 25 |
+
Optimizer:
|
| 26 |
+
name: AdamW
|
| 27 |
+
lr: 0.000325
|
| 28 |
+
weight_decay: 0.05
|
| 29 |
+
filter_bias_and_bn: True
|
| 30 |
+
|
| 31 |
+
LRScheduler:
|
| 32 |
+
name: OneCycleLR
|
| 33 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 34 |
+
cycle_momentum: False
|
| 35 |
+
|
| 36 |
+
Architecture:
|
| 37 |
+
model_type: rec
|
| 38 |
+
algorithm: BGPD
|
| 39 |
+
in_channels: 3
|
| 40 |
+
Transform:
|
| 41 |
+
Encoder:
|
| 42 |
+
name: SVTRv2LNConvTwo33
|
| 43 |
+
use_pos_embed: False
|
| 44 |
+
out_channels: 256
|
| 45 |
+
dims: [128, 256, 384]
|
| 46 |
+
depths: [6, 6, 6]
|
| 47 |
+
num_heads: [4, 8, 12]
|
| 48 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 49 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 50 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 51 |
+
last_stage: false
|
| 52 |
+
feat2d: True
|
| 53 |
+
Decoder:
|
| 54 |
+
name: GTCDecoder
|
| 55 |
+
infer_gtc: False
|
| 56 |
+
detach: False
|
| 57 |
+
gtc_decoder:
|
| 58 |
+
name: SMTRDecoder
|
| 59 |
+
num_layer: 1
|
| 60 |
+
ds: True
|
| 61 |
+
max_len: *max_text_length
|
| 62 |
+
next_mode: &next True
|
| 63 |
+
sub_str_len: &subsl 5
|
| 64 |
+
ctc_decoder:
|
| 65 |
+
name: RCTCDecoder
|
| 66 |
+
|
| 67 |
+
Loss:
|
| 68 |
+
name: CTCLoss
|
| 69 |
+
|
| 70 |
+
PostProcess:
|
| 71 |
+
name: CTCLabelDecode
|
| 72 |
+
character_dict_path: *character_dict_path
|
| 73 |
+
use_space_char: *use_space_char
|
| 74 |
+
|
| 75 |
+
Metric:
|
| 76 |
+
name: RecMetric
|
| 77 |
+
main_indicator: acc
|
| 78 |
+
is_filter: True
|
| 79 |
+
|
| 80 |
+
Train:
|
| 81 |
+
dataset:
|
| 82 |
+
name: RatioDataSetTVResize
|
| 83 |
+
ds_width: True
|
| 84 |
+
padding: false
|
| 85 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
|
| 86 |
+
'../Union14M-L-LMDB-Filtered/filter_train_hard',
|
| 87 |
+
'../Union14M-L-LMDB-Filtered/filter_train_medium',
|
| 88 |
+
'../Union14M-L-LMDB-Filtered/filter_train_normal',
|
| 89 |
+
'../Union14M-L-LMDB-Filtered/filter_train_easy',
|
| 90 |
+
]
|
| 91 |
+
transforms:
|
| 92 |
+
- DecodeImagePIL: # load image
|
| 93 |
+
img_mode: RGB
|
| 94 |
+
- PARSeqAugPIL:
|
| 95 |
+
- CTCLabelEncode: # Class handling label
|
| 96 |
+
character_dict_path: *character_dict_path
|
| 97 |
+
use_space_char: *use_space_char
|
| 98 |
+
max_text_length: *max_text_length
|
| 99 |
+
- KeepKeys:
|
| 100 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 101 |
+
sampler:
|
| 102 |
+
name: RatioSampler
|
| 103 |
+
scales: [[128, 32]] # w, h
|
| 104 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 105 |
+
first_bs: &bs 128
|
| 106 |
+
fix_bs: false
|
| 107 |
+
divided_factor: [4, 16] # w, h
|
| 108 |
+
is_training: True
|
| 109 |
+
loader:
|
| 110 |
+
shuffle: True
|
| 111 |
+
batch_size_per_card: *bs
|
| 112 |
+
drop_last: True
|
| 113 |
+
max_ratio: &max_ratio 12
|
| 114 |
+
num_workers: 4
|
| 115 |
+
|
| 116 |
+
Eval:
|
| 117 |
+
dataset:
|
| 118 |
+
name: RatioDataSetTVResize
|
| 119 |
+
ds_width: True
|
| 120 |
+
padding: False
|
| 121 |
+
data_dir_list: [
|
| 122 |
+
'../evaluation/CUTE80',
|
| 123 |
+
'../evaluation/IC13_857',
|
| 124 |
+
'../evaluation/IC15_1811',
|
| 125 |
+
'../evaluation/IIIT5k',
|
| 126 |
+
'../evaluation/SVT',
|
| 127 |
+
'../evaluation/SVTP',
|
| 128 |
+
]
|
| 129 |
+
transforms:
|
| 130 |
+
- DecodeImagePIL: # load image
|
| 131 |
+
img_mode: RGB
|
| 132 |
+
- CTCLabelEncode: # Class handling label
|
| 133 |
+
character_dict_path: *character_dict_path
|
| 134 |
+
use_space_char: *use_space_char
|
| 135 |
+
max_text_length: *max_text_length
|
| 136 |
+
- KeepKeys:
|
| 137 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 138 |
+
sampler:
|
| 139 |
+
name: RatioSampler
|
| 140 |
+
scales: [[128, 32]] # w, h
|
| 141 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 142 |
+
first_bs: *bs
|
| 143 |
+
fix_bs: false
|
| 144 |
+
divided_factor: [4, 16] # w, h
|
| 145 |
+
is_training: False
|
| 146 |
+
loader:
|
| 147 |
+
shuffle: False
|
| 148 |
+
drop_last: False
|
| 149 |
+
batch_size_per_card: *bs
|
| 150 |
+
max_ratio: *max_ratio
|
| 151 |
+
num_workers: 4
|
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_smtr_long.yml
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_nodetach_smtr_long_infer
|
| 7 |
+
save_epoch_step: [15, 1]
|
| 8 |
+
# evaluation is run every 1000 iterations
|
| 9 |
+
eval_batch_step: [0, 1000]
|
| 10 |
+
eval_epoch_step: [0, 1]
|
| 11 |
+
cal_metric_during_train: True
|
| 12 |
+
pretrained_model:
|
| 13 |
+
checkpoints:
|
| 14 |
+
use_tensorboard: false
|
| 15 |
+
infer_img:
|
| 16 |
+
# for data or label process
|
| 17 |
+
character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
|
| 18 |
+
# ./tools/utils/ppocr_keys_v1.txt # ch
|
| 19 |
+
max_text_length: &max_text_length 25
|
| 20 |
+
use_space_char: &use_space_char False
|
| 21 |
+
save_res_path: ./output/rec/predicts_smtr.txt
|
| 22 |
+
use_amp: True
|
| 23 |
+
distributed: true
|
| 24 |
+
|
| 25 |
+
Optimizer:
|
| 26 |
+
name: AdamW
|
| 27 |
+
lr: 0.000325
|
| 28 |
+
weight_decay: 0.05
|
| 29 |
+
filter_bias_and_bn: True
|
| 30 |
+
|
| 31 |
+
LRScheduler:
|
| 32 |
+
name: OneCycleLR
|
| 33 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 34 |
+
cycle_momentum: False
|
| 35 |
+
Architecture:
|
| 36 |
+
model_type: rec
|
| 37 |
+
algorithm: BGPD
|
| 38 |
+
in_channels: 3
|
| 39 |
+
Transform:
|
| 40 |
+
Encoder:
|
| 41 |
+
name: SVTRv2LNConvTwo33
|
| 42 |
+
use_pos_embed: False
|
| 43 |
+
out_channels: 256
|
| 44 |
+
dims: [128, 256, 384]
|
| 45 |
+
depths: [6, 6, 6]
|
| 46 |
+
num_heads: [4, 8, 12]
|
| 47 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 48 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 49 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 50 |
+
last_stage: false
|
| 51 |
+
feat2d: True
|
| 52 |
+
Decoder:
|
| 53 |
+
name: GTCDecoder
|
| 54 |
+
infer_gtc: True
|
| 55 |
+
detach: False
|
| 56 |
+
gtc_decoder:
|
| 57 |
+
name: SMTRDecoder
|
| 58 |
+
num_layer: 1
|
| 59 |
+
ds: True
|
| 60 |
+
max_len: *max_text_length
|
| 61 |
+
next_mode: &next True
|
| 62 |
+
sub_str_len: &subsl 5
|
| 63 |
+
infer_aug: True
|
| 64 |
+
ctc_decoder:
|
| 65 |
+
name: RCTCDecoder
|
| 66 |
+
|
| 67 |
+
Loss:
|
| 68 |
+
name: GTCLoss
|
| 69 |
+
ctc_weight: 0.1
|
| 70 |
+
gtc_loss:
|
| 71 |
+
name: SMTRLoss
|
| 72 |
+
|
| 73 |
+
PostProcess:
|
| 74 |
+
name: GTCLabelDecode
|
| 75 |
+
gtc_label_decode:
|
| 76 |
+
name: SMTRLabelDecode
|
| 77 |
+
next_mode: *next
|
| 78 |
+
character_dict_path: *character_dict_path
|
| 79 |
+
use_space_char: *use_space_char
|
| 80 |
+
only_gtc: True
|
| 81 |
+
|
| 82 |
+
Metric:
|
| 83 |
+
name: RecGTCMetric
|
| 84 |
+
main_indicator: acc
|
| 85 |
+
is_filter: True
|
| 86 |
+
|
| 87 |
+
Train:
|
| 88 |
+
dataset:
|
| 89 |
+
name: RatioDataSetTVResize
|
| 90 |
+
ds_width: True
|
| 91 |
+
padding: false
|
| 92 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
|
| 93 |
+
'../Union14M-L-LMDB-Filtered/filter_train_hard',
|
| 94 |
+
'../Union14M-L-LMDB-Filtered/filter_train_medium',
|
| 95 |
+
'../Union14M-L-LMDB-Filtered/filter_train_normal',
|
| 96 |
+
'../Union14M-L-LMDB-Filtered/filter_train_easy',
|
| 97 |
+
]
|
| 98 |
+
transforms:
|
| 99 |
+
- DecodeImagePIL: # load image
|
| 100 |
+
img_mode: RGB
|
| 101 |
+
- PARSeqAugPIL:
|
| 102 |
+
- SMTRLabelEncode: # Class handling label
|
| 103 |
+
sub_str_len: *subsl
|
| 104 |
+
character_dict_path: *character_dict_path
|
| 105 |
+
use_space_char: *use_space_char
|
| 106 |
+
max_text_length: *max_text_length
|
| 107 |
+
- KeepKeys:
|
| 108 |
+
keep_keys: ['image', 'label', 'label_subs', 'label_next', 'length_subs',
|
| 109 |
+
'label_subs_pre', 'label_next_pre', 'length_subs_pre', 'length'] # dataloader will return list in this order
|
| 110 |
+
sampler:
|
| 111 |
+
name: RatioSampler
|
| 112 |
+
scales: [[128, 32]] # w, h
|
| 113 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 114 |
+
first_bs: &bs 256
|
| 115 |
+
fix_bs: false
|
| 116 |
+
divided_factor: [4, 16] # w, h
|
| 117 |
+
is_training: True
|
| 118 |
+
loader:
|
| 119 |
+
shuffle: True
|
| 120 |
+
batch_size_per_card: *bs
|
| 121 |
+
drop_last: True
|
| 122 |
+
max_ratio: &max_ratio 12
|
| 123 |
+
num_workers: 4
|
| 124 |
+
|
| 125 |
+
Eval:
|
| 126 |
+
dataset:
|
| 127 |
+
name: SimpleDataSet
|
| 128 |
+
data_dir: ../ltb/
|
| 129 |
+
label_file_list: ['../ltb/ultra_long_70_list.txt']
|
| 130 |
+
transforms:
|
| 131 |
+
- DecodeImage: # load image
|
| 132 |
+
img_mode: BGR
|
| 133 |
+
channel_first: False
|
| 134 |
+
- GTCLabelEncode: # Class handling label
|
| 135 |
+
gtc_label_encode:
|
| 136 |
+
name: ARLabelEncode
|
| 137 |
+
character_dict_path: *character_dict_path
|
| 138 |
+
use_space_char: *use_space_char
|
| 139 |
+
max_text_length: 200
|
| 140 |
+
- SliceResize:
|
| 141 |
+
image_shape: [3, 32, 128]
|
| 142 |
+
padding: False
|
| 143 |
+
max_ratio: 12
|
| 144 |
+
- KeepKeys:
|
| 145 |
+
keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
|
| 146 |
+
loader:
|
| 147 |
+
shuffle: False
|
| 148 |
+
drop_last: False
|
| 149 |
+
batch_size_per_card: 1
|
| 150 |
+
num_workers: 2
|
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_stream.yml
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 60
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_stream
|
| 7 |
+
save_epoch_step: [15, 1]
|
| 8 |
+
# evaluation is run every 500 iterations
|
| 9 |
+
eval_batch_step: [0, 500]
|
| 10 |
+
eval_epoch_step: [0, 1]
|
| 11 |
+
cal_metric_during_train: True
|
| 12 |
+
pretrained_model:
|
| 13 |
+
checkpoints:
|
| 14 |
+
use_tensorboard: false
|
| 15 |
+
infer_img:
|
| 16 |
+
# for data or label process
|
| 17 |
+
character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
|
| 18 |
+
# ./tools/utils/ppocr_keys_v1.txt # ch
|
| 19 |
+
max_text_length: &max_text_length 25
|
| 20 |
+
use_space_char: &use_space_char False
|
| 21 |
+
save_res_path: ./output/rec/predicts_smtr.txt
|
| 22 |
+
use_amp: True
|
| 23 |
+
distributed: true
|
| 24 |
+
grad_clip_val: 20
|
| 25 |
+
|
| 26 |
+
Optimizer:
|
| 27 |
+
name: AdamW
|
| 28 |
+
lr: 0.00065
|
| 29 |
+
weight_decay: 0.05
|
| 30 |
+
filter_bias_and_bn: True
|
| 31 |
+
|
| 32 |
+
LRScheduler:
|
| 33 |
+
name: OneCycleLR
|
| 34 |
+
warmup_epoch: 5 # pct_start 5/60 ~= 0.083 (5 of 60 epochs)
|
| 35 |
+
cycle_momentum: False
|
| 36 |
+
|
| 37 |
+
Architecture:
|
| 38 |
+
model_type: rec
|
| 39 |
+
algorithm: BGPD
|
| 40 |
+
in_channels: 3
|
| 41 |
+
Transform:
|
| 42 |
+
Encoder:
|
| 43 |
+
name: SVTRv2LNConvTwo33
|
| 44 |
+
use_pos_embed: False
|
| 45 |
+
out_channels: 256
|
| 46 |
+
dims: [128, 256, 384]
|
| 47 |
+
depths: [6, 6, 6]
|
| 48 |
+
num_heads: [4, 8, 12]
|
| 49 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 50 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 51 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 52 |
+
last_stage: false
|
| 53 |
+
feat2d: True
|
| 54 |
+
Decoder:
|
| 55 |
+
name: GTCDecoder
|
| 56 |
+
infer_gtc: True
|
| 57 |
+
detach: False
|
| 58 |
+
gtc_decoder:
|
| 59 |
+
name: SMTRDecoder
|
| 60 |
+
num_layer: 1
|
| 61 |
+
ds: True
|
| 62 |
+
max_len: *max_text_length
|
| 63 |
+
next_mode: &next True
|
| 64 |
+
sub_str_len: &subsl 5
|
| 65 |
+
infer_aug: False
|
| 66 |
+
ctc_decoder:
|
| 67 |
+
name: RCTCDecoder
|
| 68 |
+
|
| 69 |
+
Loss:
|
| 70 |
+
name: GTCLoss
|
| 71 |
+
ctc_weight: 0.25
|
| 72 |
+
gtc_loss:
|
| 73 |
+
name: SMTRLoss
|
| 74 |
+
|
| 75 |
+
PostProcess:
|
| 76 |
+
name: GTCLabelDecode
|
| 77 |
+
gtc_label_decode:
|
| 78 |
+
name: SMTRLabelDecode
|
| 79 |
+
next_mode: *next
|
| 80 |
+
character_dict_path: *character_dict_path
|
| 81 |
+
use_space_char: *use_space_char
|
| 82 |
+
only_gtc: True
|
| 83 |
+
|
| 84 |
+
Metric:
|
| 85 |
+
name: RecMetric
|
| 86 |
+
main_indicator: acc
|
| 87 |
+
is_filter: True
|
| 88 |
+
stream: True
|
| 89 |
+
|
| 90 |
+
Train:
|
| 91 |
+
dataset:
|
| 92 |
+
name: RatioDataSetTVResize
|
| 93 |
+
ds_width: True
|
| 94 |
+
padding: false
|
| 95 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
|
| 96 |
+
'../Union14M-L-LMDB-Filtered/filter_train_hard',
|
| 97 |
+
'../Union14M-L-LMDB-Filtered/filter_train_medium',
|
| 98 |
+
'../Union14M-L-LMDB-Filtered/filter_train_normal',
|
| 99 |
+
'../Union14M-L-LMDB-Filtered/filter_train_easy',
|
| 100 |
+
]
|
| 101 |
+
transforms:
|
| 102 |
+
- DecodeImagePIL: # load image
|
| 103 |
+
img_mode: RGB
|
| 104 |
+
- PARSeqAugPIL:
|
| 105 |
+
- SMTRLabelEncode: # Class handling label
|
| 106 |
+
sub_str_len: *subsl
|
| 107 |
+
character_dict_path: *character_dict_path
|
| 108 |
+
use_space_char: *use_space_char
|
| 109 |
+
max_text_length: *max_text_length
|
| 110 |
+
- KeepKeys:
|
| 111 |
+
keep_keys: ['image', 'label', 'label_subs', 'label_next', 'length_subs',
|
| 112 |
+
'label_subs_pre', 'label_next_pre', 'length_subs_pre', 'length'] # dataloader will return list in this order
|
| 113 |
+
sampler:
|
| 114 |
+
name: RatioSampler
|
| 115 |
+
scales: [[128, 32]] # w, h
|
| 116 |
+
# divided_factor: ensures the width and height dimensions are divisible by the downsampling multiple
|
| 117 |
+
first_bs: &bs 256
|
| 118 |
+
fix_bs: false
|
| 119 |
+
divided_factor: [4, 16] # w, h
|
| 120 |
+
is_training: True
|
| 121 |
+
loader:
|
| 122 |
+
shuffle: True
|
| 123 |
+
batch_size_per_card: *bs
|
| 124 |
+
drop_last: True
|
| 125 |
+
max_ratio: &max_ratio 12
|
| 126 |
+
num_workers: 4
|
| 127 |
+
|
| 128 |
+
Eval:
|
| 129 |
+
dataset:
|
| 130 |
+
name: SimpleDataSet
|
| 131 |
+
data_dir: ../ltb/
|
| 132 |
+
label_file_list: ['../ltb/ultra_long_70_list.txt']
|
| 133 |
+
transforms:
|
| 134 |
+
- DecodeImagePIL: # load image
|
| 135 |
+
img_mode: RGB
|
| 136 |
+
- GTCLabelEncode: # Class handling label
|
| 137 |
+
gtc_label_encode:
|
| 138 |
+
name: ARLabelEncode
|
| 139 |
+
character_dict_path: *character_dict_path
|
| 140 |
+
use_space_char: *use_space_char
|
| 141 |
+
max_text_length: *max_text_length
|
| 142 |
+
- SliceTVResize:
|
| 143 |
+
image_shape: [32, 128]
|
| 144 |
+
padding: False
|
| 145 |
+
max_ratio: 4
|
| 146 |
+
- KeepKeys:
|
| 147 |
+
keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
|
| 148 |
+
loader:
|
| 149 |
+
shuffle: False
|
| 150 |
+
drop_last: False
|
| 151 |
+
batch_size_per_card: 1
|
| 152 |
+
num_workers: 2
|
configs/rec/igtr/readme.md
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# IGTR
|
| 2 |
+
|
| 3 |
+
- [IGTR](#igtr)
|
| 4 |
+
- [1. Introduction](#1-introduction)
|
| 5 |
+
- [2. Environment](#2-environment)
|
| 6 |
+
- [Dataset Preparation](#dataset-preparation)
|
| 7 |
+
- [3. Model Training / Evaluation](#3-model-training--evaluation)
|
| 8 |
+
- [Citation](#citation)
|
| 9 |
+
|
| 10 |
+
<a name="1"></a>
|
| 11 |
+
|
| 12 |
+
## 1. Introduction
|
| 13 |
+
|
| 14 |
+
Paper:
|
| 15 |
+
|
| 16 |
+
> [Instruction-Guided Scene Text Recognition](https://arxiv.org/abs/2401.17851),
|
| 17 |
+
> Yongkun Du, Zhineng Chen, Yuchen Su, Caiyan Jia, Yu-Gang Jiang,
|
| 18 |
+
> TPAMI
|
| 19 |
+
|
| 20 |
+
<a name="model"></a>
|
| 21 |
+
Multi-modal models have shown appealing performance in visual recognition tasks, as free-form text-guided training evokes the ability to understand fine-grained visual content. However, current models cannot be trivially applied to scene text recognition (STR) due to the compositional difference between natural and text images. We propose a novel instruction-guided scene text recognition (IGTR) paradigm that formulates STR as an instruction learning problem and understands text images by predicting character attributes, e.g., character frequency, position, etc. IGTR first devises $\left\langle condition, question, answer \right\rangle$ instruction triplets, providing rich and diverse descriptions of character attributes. To effectively learn these attributes through question-answering, IGTR develops a lightweight instruction encoder, a cross-modal feature fusion module and a multi-task answer head, which guides nuanced text image understanding. Furthermore, IGTR realizes different recognition pipelines simply by using different instructions, enabling a character-understanding-based text reasoning paradigm that differs from current methods considerably. Experiments on English and Chinese benchmarks show that IGTR outperforms existing models by significant margins, while maintaining a small model size and fast inference speed. Moreover, by adjusting the sampling of instructions, IGTR offers an elegant way to tackle the recognition of rarely appearing and morphologically similar characters, which were previous challenges.
|
| 22 |
+
|
| 23 |
+
<a name="model"></a>
|
| 24 |
+
The accuracy (%) and model files of IGTR on the public dataset of scene text recognition are as follows:
|
| 25 |
+
|
| 26 |
+
- Trained on the Synth dataset (MJ+ST) and tested on the Common Benchmarks; both the training and test datasets are from [PARSeq](https://github.com/baudm/parseq).
|
| 27 |
+
|
| 28 |
+
| Model | IC13<br/>857 | SVT | IIIT5k<br/>3000 | IC15<br/>1811 | SVTP | CUTE80 | Avg | Config&Model&Log |
|
| 29 |
+
| :-----: | :----------: | :--: | :-------------: | :-----------: | :--: | :----: | :---: | :---------------------------------------------------------------------------------------------: |
|
| 30 |
+
| IGTR-PD | 97.6 | 95.2 | 97.6 | 88.4 | 91.6 | 95.5 | 94.30 | [link](https://drive.google.com/drive/folders/1Pv0CW2hiWC_dIyaB74W1fsXqiX3z5yXA?usp=drive_link) |
|
| 31 |
+
| IGTR-AR | 98.6 | 95.7 | 98.2 | 88.4 | 92.4 | 95.5 | 94.78 | as above |
|
| 32 |
+
|
| 33 |
+
- Tested on the Union14M-L benchmark, from [Union14M](https://github.com/Mountchicken/Union14M/).
|
| 34 |
+
|
| 35 |
+
| Model | Curve | Multi-<br/>Oriented | Artistic | Contextless | Salient | Multi-<br/>word | General | Avg | Config&Model&Log |
|
| 36 |
+
| :-----: | :---: | :-----------------: | :------: | :---------: | :-----: | :-------------: | :-----: | :---: | :---------------------: |
|
| 37 |
+
| IGTR-PD | 76.9 | 30.6 | 59.1 | 63.3 | 77.8 | 62.5 | 66.7 | 62.40 | Same as the above table |
|
| 38 |
+
| IGTR-AR | 78.4 | 31.9 | 61.3 | 66.5 | 80.2 | 69.3 | 67.9 | 65.07 | as above |
|
| 39 |
+
|
| 40 |
+
- Trained on Union14M-L training dataset.
|
| 41 |
+
|
| 42 |
+
| Model | IC13<br/>857 | SVT | IIIT5k<br/>3000 | IC15<br/>1811 | SVTP | CUTE80 | Avg | Config&Model&Log |
|
| 43 |
+
| :----------: | :----------: | :--: | :-------------: | :-----------: | :--: | :----: | :---: | :---------------------------------------------------------------------------------------------: |
|
| 44 |
+
| IGTR-PD | 97.7 | 97.7 | 98.3 | 89.8 | 93.7 | 97.9 | 95.86 | [link](https://drive.google.com/drive/folders/1ZGlzDqEzjrBg8qG2wBkbOm3bLRzFbTzo?usp=drive_link) |
|
| 45 |
+
| IGTR-AR | 98.1 | 98.4 | 98.7 | 90.5 | 94.9 | 98.3 | 96.48 | as above |
|
| 46 |
+
| IGTR-PD-60ep | 97.9 | 98.3 | 99.2 | 90.8 | 93.7 | 97.6 | 96.24 | [link](https://drive.google.com/drive/folders/1ik4hxZDRsjU1RbCA19nwE45Kg1bCnMoa?usp=drive_link) |
|
| 47 |
+
| IGTR-AR-60ep | 98.4 | 98.1 | 99.3 | 91.5 | 94.3 | 97.6 | 96.54 | as above |
|
| 48 |
+
| IGTR-PD-PT | 98.6 | 98.0 | 99.1 | 91.7 | 96.8 | 99.0 | 97.20 | [link](https://drive.google.com/drive/folders/1QM0EWV66IfYI1G0Xm066V2zJA62hH6-1?usp=drive_link) |
|
| 49 |
+
| IGTR-AR-PT | 98.8 | 98.3 | 99.2 | 92.0 | 96.8 | 99.0 | 97.34 | as above |
|
| 50 |
+
|
| 51 |
+
| Model | Curve | Multi-<br/>Oriented | Artistic | Contextless | Salient | Multi-<br/>word | General | Avg | Config&Model&Log |
|
| 52 |
+
| :----------: | :---: | :-----------------: | :------: | :---------: | :-----: | :-------------: | :-----: | :---: | :---------------------: |
|
| 53 |
+
| IGTR-PD | 88.1 | 89.9 | 74.2 | 80.3 | 82.8 | 79.2 | 83.0 | 82.51 | Same as the above table |
|
| 54 |
+
| IGTR-AR | 90.4 | 91.2 | 77.0 | 82.4 | 84.7 | 84.0 | 84.4 | 84.86 | as above |
|
| 55 |
+
| IGTR-PD-60ep | 90.0 | 92.1 | 77.5 | 82.8 | 86.0 | 83.0 | 84.8 | 85.18 | Same as the above table |
|
| 56 |
+
| IGTR-AR-60ep | 91.0 | 93.0 | 78.7 | 84.6 | 87.3 | 84.8 | 85.6 | 86.43 | as above |
|
| 57 |
+
| IGTR-PD-PT | 92.4 | 92.1 | 80.7 | 83.6 | 87.7 | 86.9 | 85.0 | 86.92 | Same as the above table |
|
| 58 |
+
| IGTR-AR-PT | 93.0 | 92.9 | 81.3 | 83.4 | 88.6 | 88.7 | 85.6 | 87.65 | as above |
|
| 59 |
+
|
| 60 |
+
- Trained and tested on the Chinese dataset, from [Chinese Benchmark](https://github.com/FudanVI/benchmarking-chinese-text-recognition).
|
| 61 |
+
|
| 62 |
+
| Model | Scene | Web | Document | Handwriting | Avg | Config&Model&Log |
|
| 63 |
+
| :---------: | :---: | :--: | :------: | :---------: | :---: | :---------------------------------------------------------------------------------------------: |
|
| 64 |
+
| IGTR-PD | 73.1 | 74.8 | 98.6 | 52.5 | 74.75 | |
|
| 65 |
+
| IGTR-AR | 75.1 | 76.4 | 98.7 | 55.3 | 76.37 | |
|
| 66 |
+
| IGTR-PD-TS | 73.5 | 75.9 | 98.7 | 54.5 | 75.65 | [link](https://drive.google.com/drive/folders/1H3VRdGHjhawd6fkSC-qlBzVzvYYTpHRg?usp=drive_link) |
|
| 67 |
+
| IGTR-AR-TS | 75.6 | 77.0 | 98.8 | 57.3 | 77.17 | as above |
|
| 68 |
+
| IGTR-PD-Aug | 79.5 | 80.0 | 99.4 | 58.9 | 79.45 | [link](https://drive.google.com/drive/folders/1XFQkCILwcFwA7iYyQY9crnrouaI5sqcZ?usp=drive_link) |
|
| 69 |
+
| IGTR-AR-Aug | 82.0 | 81.7 | 99.5 | 63.8 | 81.74 | as above |
|
| 70 |
+
|
| 71 |
+
Download all Configs, Models, and Logs from [Google Drive](https://drive.google.com/drive/folders/1mSRDg9Mj5R6PspAdFGXZHDHTCQmjkd8d?usp=drive_link).
|
| 72 |
+
|
| 73 |
+
<a name="2"></a>
|
| 74 |
+
|
| 75 |
+
## 2. Environment
|
| 76 |
+
|
| 77 |
+
- [PyTorch](http://pytorch.org/) version >= 1.13.0
|
| 78 |
+
- Python version >= 3.7
|
| 79 |
+
|
| 80 |
+
```shell
|
| 81 |
+
git clone -b develop https://github.com/Topdu/OpenOCR.git
|
| 82 |
+
cd OpenOCR
|
| 83 |
+
# A100 Ubuntu 20.04 Cuda 11.8
|
| 84 |
+
conda create -n openocr python==3.8
|
| 85 |
+
conda activate openocr
|
| 86 |
+
conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=11.8 -c pytorch -c nvidia
|
| 87 |
+
pip install -r requirements.txt
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
#### Dataset Preparation
|
| 91 |
+
|
| 92 |
+
- [English dataset download](https://github.com/baudm/parseq)
|
| 93 |
+
|
| 94 |
+
- [Union14M-L-LMDB-Filtered download](https://drive.google.com/drive/folders/1OlDWJZgvd6s4S09S3IGeAI90jI0i7AB_?usp=sharing)
|
| 95 |
+
|
| 96 |
+
- [Chinese dataset download](https://github.com/fudanvi/benchmarking-chinese-text-recognition#download)
|
| 97 |
+
|
| 98 |
+
The expected filesystem structure is as follows:
|
| 99 |
+
|
| 100 |
+
```
|
| 101 |
+
benchmark_bctr
|
| 102 |
+
├── benchmark_bctr_test
|
| 103 |
+
│ ├── document_test
|
| 104 |
+
│ ├── handwriting_test
|
| 105 |
+
│ ├── scene_test
|
| 106 |
+
│ └── web_test
|
| 107 |
+
└── benchmark_bctr_train
|
| 108 |
+
├── document_train
|
| 109 |
+
├── handwriting_train
|
| 110 |
+
├── scene_train
|
| 111 |
+
└── web_train
|
| 112 |
+
evaluation
|
| 113 |
+
├── CUTE80
|
| 114 |
+
├── IC13_857
|
| 115 |
+
├── IC15_1811
|
| 116 |
+
├── IIIT5k
|
| 117 |
+
├── SVT
|
| 118 |
+
└── SVTP
|
| 119 |
+
OpenOCR
|
| 120 |
+
synth
|
| 121 |
+
├── MJ
|
| 122 |
+
│ ├── test
|
| 123 |
+
│ ├── train
|
| 124 |
+
│ └── val
|
| 125 |
+
└── ST
|
| 126 |
+
test # from PARSeq
|
| 127 |
+
├── ArT
|
| 128 |
+
├── COCOv1.4
|
| 129 |
+
├── CUTE80
|
| 130 |
+
├── IC13_1015
|
| 131 |
+
├── IC13_1095
|
| 132 |
+
├── IC13_857
|
| 133 |
+
├── IC15_1811
|
| 134 |
+
├── IC15_2077
|
| 135 |
+
├── IIIT5k
|
| 136 |
+
├── SVT
|
| 137 |
+
├── SVTP
|
| 138 |
+
└── Uber
|
| 139 |
+
u14m # lmdb format
|
| 140 |
+
├── artistic
|
| 141 |
+
├── contextless
|
| 142 |
+
├── curve
|
| 143 |
+
├── general
|
| 144 |
+
├── multi_oriented
|
| 145 |
+
├── multi_words
|
| 146 |
+
└── salient
|
| 147 |
+
Union14M-L-LMDB-Filtered # lmdb format
|
| 148 |
+
├── train_challenging
|
| 149 |
+
├── train_easy
|
| 150 |
+
├── train_hard
|
| 151 |
+
├── train_medium
|
| 152 |
+
└── train_normal
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
<a name="3"></a>
|
| 156 |
+
|
| 157 |
+
## 3. Model Training / Evaluation
|
| 158 |
+
|
| 159 |
+
Training:
|
| 160 |
+
|
| 161 |
+
```shell
|
| 162 |
+
# The configuration file is available from the link provided in the table above.
|
| 163 |
+
# Multi GPU training
|
| 164 |
+
CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 tools/train_rec.py --c PATH/svtr_base_igtr_XXX.yml
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
Evaluation:
|
| 168 |
+
|
| 169 |
+
```shell
|
| 170 |
+
# The configuration file is available from the link provided in the table above.
|
| 171 |
+
# en
|
| 172 |
+
python tools/eval_rec_all_en.py --c PATH/svtr_base_igtr_syn.yml
|
| 173 |
+
# ch
|
| 174 |
+
python tools/eval_rec_all_ch.py --c PATH/svtr_base_igtr_ch_aug.yml
|
| 175 |
+
```
|
| 176 |
+
|
| 177 |
+
## Citation
|
| 178 |
+
|
| 179 |
+
If you find our method useful for your research, please cite:
|
| 180 |
+
|
| 181 |
+
```bibtex
|
| 182 |
+
@article{Du2024IGTR,
|
| 183 |
+
title = {Instruction-Guided Scene Text Recognition},
|
| 184 |
+
author = {Du, Yongkun and Chen, Zhineng and Su, Yuchen and Jia, Caiyan and Jiang, Yu-Gang},
|
| 185 |
+
journal = {CoRR},
|
| 186 |
+
eprinttype = {arXiv},
|
| 187 |
+
primaryClass={cs.CV},
|
| 188 |
+
volume = {abs/2401.17851},
|
| 189 |
+
year = {2024},
|
| 190 |
+
url = {https://arxiv.org/abs/2401.17851}
|
| 191 |
+
}
|
| 192 |
+
```
|
configs/rec/igtr/svtr_base_ds_igtr.yml
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtr_base_igtr
|
| 7 |
+
save_epoch_step: [15, 1]
|
| 8 |
+
# evaluation is run every 500 iterations (eval_batch_step: [start_iter, interval])
|
| 9 |
+
eval_batch_step: [0, 500]
|
| 10 |
+
eval_epoch_step: [0, 1]
|
| 11 |
+
cal_metric_during_train: True
|
| 12 |
+
pretrained_model:
|
| 13 |
+
checkpoints:
|
| 14 |
+
use_tensorboard: false
|
| 15 |
+
infer_img:
|
| 16 |
+
# for data or label process
|
| 17 |
+
character_dict_path: &character_dict_path
|
| 18 |
+
# ./tools/utils/EN_symbol_dict.txt # 96en
|
| 19 |
+
# ./tools/utils/ppocr_keys_v1.txt # ch
|
| 20 |
+
max_text_length: &max_text_length 25
|
| 21 |
+
use_space_char: &use_space_char False
|
| 22 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_igtr.txt
|
| 23 |
+
use_amp: True
|
| 24 |
+
|
| 25 |
+
Optimizer:
|
| 26 |
+
name: AdamW
|
| 27 |
+
lr: 0.0005 # 2gpus 384bs/gpu
|
| 28 |
+
weight_decay: 0.05
|
| 29 |
+
filter_bias_and_bn: True
|
| 30 |
+
|
| 31 |
+
LRScheduler:
|
| 32 |
+
name: OneCycleLR
|
| 33 |
+
warmup_epoch: 1.5
|
| 34 |
+
cycle_momentum: False
|
| 35 |
+
|
| 36 |
+
Architecture:
|
| 37 |
+
model_type: rec
|
| 38 |
+
algorithm: IGTR
|
| 39 |
+
in_channels: 3
|
| 40 |
+
Transform:
|
| 41 |
+
Encoder:
|
| 42 |
+
name: SVTRNet2DPos
|
| 43 |
+
img_size: [32, -1]
|
| 44 |
+
out_char_num: 25
|
| 45 |
+
out_channels: 256
|
| 46 |
+
patch_merging: 'Conv'
|
| 47 |
+
embed_dim: [128, 256, 384]
|
| 48 |
+
depth: [6, 6, 6]
|
| 49 |
+
num_heads: [4, 8, 12]
|
| 50 |
+
mixer: ['ConvB','ConvB','ConvB','ConvB','ConvB','ConvB', 'ConvB','ConvB', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
|
| 51 |
+
local_mixer: [[5, 5], [5, 5], [5, 5]]
|
| 52 |
+
last_stage: False
|
| 53 |
+
prenorm: True
|
| 54 |
+
use_first_sub: False
|
| 55 |
+
Decoder:
|
| 56 |
+
name: IGTRDecoder
|
| 57 |
+
dim: 384
|
| 58 |
+
num_layer: 1
|
| 59 |
+
ar: False
|
| 60 |
+
refine_iter: 0
|
| 61 |
+
# next_pred: True
|
| 62 |
+
next_pred: False
|
| 63 |
+
pos2d: True
|
| 64 |
+
ds: True
|
| 65 |
+
# pos_len: False
|
| 66 |
+
# rec_layer: 1
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
Loss:
|
| 70 |
+
name: IGTRLoss
|
| 71 |
+
|
| 72 |
+
PostProcess:
|
| 73 |
+
name: IGTRLabelDecode
|
| 74 |
+
character_dict_path: *character_dict_path
|
| 75 |
+
use_space_char: *use_space_char
|
| 76 |
+
|
| 77 |
+
Metric:
|
| 78 |
+
name: RecMetric
|
| 79 |
+
main_indicator: acc
|
| 80 |
+
|
| 81 |
+
Train:
|
| 82 |
+
dataset:
|
| 83 |
+
name: RatioDataSet
|
| 84 |
+
ds_width: True
|
| 85 |
+
padding: &padding False
|
| 86 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
|
| 87 |
+
'../Union14M-L-LMDB-Filtered/filter_train_hard',
|
| 88 |
+
'../Union14M-L-LMDB-Filtered/filter_train_medium',
|
| 89 |
+
'../Union14M-L-LMDB-Filtered/filter_train_normal',
|
| 90 |
+
'../Union14M-L-LMDB-Filtered/filter_train_easy',
|
| 91 |
+
]
|
| 92 |
+
transforms:
|
| 93 |
+
- DecodeImage: # load image
|
| 94 |
+
img_mode: BGR
|
| 95 |
+
channel_first: False
|
| 96 |
+
- PARSeqAug:
|
| 97 |
+
- IGTRLabelEncode: # Class handling label
|
| 98 |
+
k: 8
|
| 99 |
+
prompt_error: False
|
| 100 |
+
character_dict_path: *character_dict_path
|
| 101 |
+
use_space_char: *use_space_char
|
| 102 |
+
max_text_length: *max_text_length
|
| 103 |
+
- KeepKeys:
|
| 104 |
+
keep_keys: ['image', 'label', 'prompt_pos_idx_list',
|
| 105 |
+
'prompt_char_idx_list', 'ques_pos_idx_list', 'ques1_answer_list',
|
| 106 |
+
'ques2_char_idx_list', 'ques2_answer_list', 'ques3_answer', 'ques4_char_num_list',
|
| 107 |
+
'ques_len_list', 'ques2_len_list', 'prompt_len_list', 'length'] # dataloader will return list in this order
|
| 108 |
+
sampler:
|
| 109 |
+
name: RatioSampler
|
| 110 |
+
scales: [[128, 32]] # w, h
|
| 111 |
+
# divided_factor: ensures the width and height can be divided evenly by the downsampling multiple
|
| 112 |
+
first_bs: &bs 384
|
| 113 |
+
fix_bs: false
|
| 114 |
+
divided_factor: [4, 16] # w, h
|
| 115 |
+
is_training: True
|
| 116 |
+
loader:
|
| 117 |
+
shuffle: True
|
| 118 |
+
batch_size_per_card: *bs
|
| 119 |
+
drop_last: True
|
| 120 |
+
max_ratio: &max_ratio 4
|
| 121 |
+
num_workers: 4
|
| 122 |
+
|
| 123 |
+
Eval:
|
| 124 |
+
dataset:
|
| 125 |
+
name: RatioDataSet
|
| 126 |
+
ds_width: True
|
| 127 |
+
padding: *padding
|
| 128 |
+
data_dir_list: ['../evaluation/CUTE80',
|
| 129 |
+
'../evaluation/IC13_857',
|
| 130 |
+
'../evaluation/IC15_1811',
|
| 131 |
+
'../evaluation/IIIT5k',
|
| 132 |
+
'../evaluation/SVT',
|
| 133 |
+
'../evaluation/SVTP']
|
| 134 |
+
transforms:
|
| 135 |
+
- DecodeImage: # load image
|
| 136 |
+
img_mode: BGR
|
| 137 |
+
channel_first: False
|
| 138 |
+
- ARLabelEncode: # Class handling label
|
| 139 |
+
character_dict_path: *character_dict_path
|
| 140 |
+
use_space_char: *use_space_char
|
| 141 |
+
max_text_length: *max_text_length
|
| 142 |
+
- KeepKeys:
|
| 143 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 144 |
+
sampler:
|
| 145 |
+
name: RatioSampler
|
| 146 |
+
scales: [[128, 32]] # w, h
|
| 147 |
+
# divided_factor: ensures the width and height can be divided evenly by the downsampling multiple
|
| 148 |
+
first_bs: 256
|
| 149 |
+
fix_bs: false
|
| 150 |
+
divided_factor: [4, 16] # w, h
|
| 151 |
+
is_training: False
|
| 152 |
+
loader:
|
| 153 |
+
shuffle: False
|
| 154 |
+
drop_last: False
|
| 155 |
+
batch_size_per_card: 256
|
| 156 |
+
max_ratio: *max_ratio
|
| 157 |
+
num_workers: 4
|
configs/rec/lister/focalsvtr_lister_wo_fem_maxratio12.yml
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/focalsvtr_lister_wo_fem_maxratio12/
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: &max_text_length 25
|
| 17 |
+
use_space_char: &use_space_char False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_focalsvtr_lister_wo_fem_maxratio12.txt
|
| 19 |
+
use_amp: True
|
| 20 |
+
grad_clip_val: 20
|
| 21 |
+
|
| 22 |
+
Optimizer:
|
| 23 |
+
name: AdamW
|
| 24 |
+
lr: 0.00065
|
| 25 |
+
weight_decay: 0.05
|
| 26 |
+
filter_bias_and_bn: True
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: OneCycleLR
|
| 30 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 31 |
+
cycle_momentum: False
|
| 32 |
+
|
| 33 |
+
Architecture:
|
| 34 |
+
model_type: rec
|
| 35 |
+
algorithm: LISTER
|
| 36 |
+
Transform:
|
| 37 |
+
Encoder:
|
| 38 |
+
name: FocalSVTR
|
| 39 |
+
img_size: [32, 128]
|
| 40 |
+
depths: [6, 6, 9]
|
| 41 |
+
embed_dim: 96
|
| 42 |
+
sub_k: [[1, 1], [2, 1], [1, 1]]
|
| 43 |
+
focal_levels: [3, 3, 3]
|
| 44 |
+
last_stage: False
|
| 45 |
+
feat2d: True
|
| 46 |
+
Decoder:
|
| 47 |
+
name: LISTERDecoder
|
| 48 |
+
detach_grad: False
|
| 49 |
+
attn_scaling: True
|
| 50 |
+
use_fem: False
|
| 51 |
+
|
| 52 |
+
Loss:
|
| 53 |
+
name: LISTERLoss
|
| 54 |
+
|
| 55 |
+
PostProcess:
|
| 56 |
+
name: LISTERLabelDecode
|
| 57 |
+
|
| 58 |
+
Metric:
|
| 59 |
+
name: RecMetric
|
| 60 |
+
main_indicator: acc
|
| 61 |
+
is_filter: True
|
| 62 |
+
|
| 63 |
+
Train:
|
| 64 |
+
dataset:
|
| 65 |
+
name: RatioDataSetTVResize
|
| 66 |
+
ds_width: True
|
| 67 |
+
padding: False
|
| 68 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_filter_train_challenging',
|
| 69 |
+
'../Union14M-L-LMDB-Filtered/filter_filter_train_hard',
|
| 70 |
+
'../Union14M-L-LMDB-Filtered/filter_filter_train_medium',
|
| 71 |
+
'../Union14M-L-LMDB-Filtered/filter_filter_train_normal',
|
| 72 |
+
'../Union14M-L-LMDB-Filtered/filter_filter_train_easy',
|
| 73 |
+
]
|
| 74 |
+
transforms:
|
| 75 |
+
- DecodeImagePIL: # load image
|
| 76 |
+
img_mode: RGB
|
| 77 |
+
- PARSeqAugPIL:
|
| 78 |
+
- EPLabelEncode: # Class handling label
|
| 79 |
+
character_dict_path: *character_dict_path
|
| 80 |
+
use_space_char: *use_space_char
|
| 81 |
+
max_text_length: *max_text_length
|
| 82 |
+
- KeepKeys:
|
| 83 |
+
keep_keys: ['image', 'label', 'length']
|
| 84 |
+
sampler:
|
| 85 |
+
name: RatioSampler
|
| 86 |
+
scales: [[128, 32]] # w, h
|
| 87 |
+
# divided_factor: ensures the width and height can be divided evenly by the downsampling multiple
|
| 88 |
+
first_bs: &bs 256
|
| 89 |
+
fix_bs: false
|
| 90 |
+
divided_factor: [4, 16] # w, h
|
| 91 |
+
is_training: True
|
| 92 |
+
loader:
|
| 93 |
+
shuffle: True
|
| 94 |
+
batch_size_per_card: *bs
|
| 95 |
+
drop_last: True
|
| 96 |
+
max_ratio: 12
|
| 97 |
+
num_workers: 4
|
| 98 |
+
|
| 99 |
+
Eval:
|
| 100 |
+
dataset:
|
| 101 |
+
name: RatioDataSetTVResize
|
| 102 |
+
ds_width: True
|
| 103 |
+
padding: False
|
| 104 |
+
data_dir_list: ['../evaluation/CUTE80',
|
| 105 |
+
'../evaluation/IC13_857',
|
| 106 |
+
'../evaluation/IC15_1811',
|
| 107 |
+
'../evaluation/IIIT5k',
|
| 108 |
+
'../evaluation/SVT',
|
| 109 |
+
'../evaluation/SVTP',
|
| 110 |
+
]
|
| 111 |
+
transforms:
|
| 112 |
+
- DecodeImagePIL: # load image
|
| 113 |
+
img_mode: RGB
|
| 114 |
+
- EPLabelEncode: # Class handling label
|
| 115 |
+
character_dict_path: *character_dict_path
|
| 116 |
+
use_space_char: *use_space_char
|
| 117 |
+
max_text_length: *max_text_length
|
| 118 |
+
- KeepKeys:
|
| 119 |
+
keep_keys: ['image', 'label', 'length']
|
| 120 |
+
sampler:
|
| 121 |
+
name: RatioSampler
|
| 122 |
+
scales: [[128, 32]] # w, h
|
| 123 |
+
# divided_factor: ensures the width and height can be divided evenly by the downsampling multiple
|
| 124 |
+
first_bs: 256
|
| 125 |
+
fix_bs: false
|
| 126 |
+
divided_factor: [4, 16] # w, h
|
| 127 |
+
is_training: False
|
| 128 |
+
loader:
|
| 129 |
+
shuffle: False
|
| 130 |
+
drop_last: False
|
| 131 |
+
batch_size_per_card: *bs
|
| 132 |
+
max_ratio: 12
|
| 133 |
+
num_workers: 4
|
configs/rec/lister/svtrv2_lister_wo_fem_maxratio12.yml
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtrv2_lister_wo_fem_maxratio12/
|
| 7 |
+
eval_epoch_step: [0, 1]
|
| 8 |
+
eval_batch_step: [0, 500]
|
| 9 |
+
cal_metric_during_train: True
|
| 10 |
+
pretrained_model:
|
| 11 |
+
checkpoints:
|
| 12 |
+
use_tensorboard: false
|
| 13 |
+
infer_img:
|
| 14 |
+
# for data or label process
|
| 15 |
+
character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
|
| 16 |
+
max_text_length: &max_text_length 25
|
| 17 |
+
use_space_char: &use_space_char False
|
| 18 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_lister_wo_fem_maxratio12.txt
|
| 19 |
+
use_amp: True
|
| 20 |
+
grad_clip_val: 20
|
| 21 |
+
|
| 22 |
+
Optimizer:
|
| 23 |
+
name: AdamW
|
| 24 |
+
lr: 0.000325
|
| 25 |
+
weight_decay: 0.05
|
| 26 |
+
filter_bias_and_bn: True
|
| 27 |
+
|
| 28 |
+
LRScheduler:
|
| 29 |
+
name: OneCycleLR
|
| 30 |
+
|
| 31 |
+
warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
|
| 32 |
+
cycle_momentum: False
|
| 33 |
+
|
| 34 |
+
Architecture:
|
| 35 |
+
model_type: rec
|
| 36 |
+
algorithm: LISTER
|
| 37 |
+
Transform:
|
| 38 |
+
Encoder:
|
| 39 |
+
name: SVTRv2LNConvTwo33
|
| 40 |
+
use_pos_embed: False
|
| 41 |
+
out_channels: 256
|
| 42 |
+
dims: [128, 256, 384]
|
| 43 |
+
depths: [6, 6, 6]
|
| 44 |
+
num_heads: [4, 8, 12]
|
| 45 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 46 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 47 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 48 |
+
last_stage: false
|
| 49 |
+
feat2d: True
|
| 50 |
+
Decoder:
|
| 51 |
+
name: LISTERDecoder
|
| 52 |
+
detach_grad: False
|
| 53 |
+
attn_scaling: True
|
| 54 |
+
use_fem: False
|
| 55 |
+
|
| 56 |
+
Loss:
|
| 57 |
+
name: LISTERLoss
|
| 58 |
+
|
| 59 |
+
PostProcess:
|
| 60 |
+
name: LISTERLabelDecode
|
| 61 |
+
|
| 62 |
+
Metric:
|
| 63 |
+
name: RecMetric
|
| 64 |
+
main_indicator: acc
|
| 65 |
+
is_filter: True
|
| 66 |
+
|
| 67 |
+
Train:
|
| 68 |
+
dataset:
|
| 69 |
+
name: RatioDataSetTVResize
|
| 70 |
+
ds_width: True
|
| 71 |
+
padding: False
|
| 72 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
|
| 73 |
+
'../Union14M-L-LMDB-Filtered/filter_train_hard',
|
| 74 |
+
'../Union14M-L-LMDB-Filtered/filter_train_medium',
|
| 75 |
+
'../Union14M-L-LMDB-Filtered/filter_train_normal',
|
| 76 |
+
'../Union14M-L-LMDB-Filtered/filter_train_easy',
|
| 77 |
+
]
|
| 78 |
+
transforms:
|
| 79 |
+
- DecodeImagePIL: # load image
|
| 80 |
+
img_mode: RGB
|
| 81 |
+
- PARSeqAugPIL:
|
| 82 |
+
- EPLabelEncode: # Class handling label
|
| 83 |
+
character_dict_path: *character_dict_path
|
| 84 |
+
use_space_char: *use_space_char
|
| 85 |
+
max_text_length: *max_text_length
|
| 86 |
+
- KeepKeys:
|
| 87 |
+
keep_keys: ['image', 'label', 'length']
|
| 88 |
+
sampler:
|
| 89 |
+
name: RatioSampler
|
| 90 |
+
scales: [[128, 32]] # w, h
|
| 91 |
+
# divided_factor: ensures the width and height can be divided evenly by the downsampling multiple
|
| 92 |
+
first_bs: &bs 128
|
| 93 |
+
fix_bs: false
|
| 94 |
+
divided_factor: [4, 16] # w, h
|
| 95 |
+
is_training: True
|
| 96 |
+
loader:
|
| 97 |
+
shuffle: True
|
| 98 |
+
batch_size_per_card: *bs
|
| 99 |
+
drop_last: True
|
| 100 |
+
max_ratio: 12
|
| 101 |
+
num_workers: 4
|
| 102 |
+
|
| 103 |
+
Eval:
|
| 104 |
+
dataset:
|
| 105 |
+
name: RatioDataSetTVResize
|
| 106 |
+
ds_width: True
|
| 107 |
+
padding: False
|
| 108 |
+
data_dir_list: ['../evaluation/CUTE80',
|
| 109 |
+
'../evaluation/IC13_857',
|
| 110 |
+
'../evaluation/IC15_1811',
|
| 111 |
+
'../evaluation/IIIT5k',
|
| 112 |
+
'../evaluation/SVT',
|
| 113 |
+
'../evaluation/SVTP',
|
| 114 |
+
]
|
| 115 |
+
transforms:
|
| 116 |
+
- DecodeImagePIL: # load image
|
| 117 |
+
img_mode: RGB
|
| 118 |
+
- EPLabelEncode: # Class handling label
|
| 119 |
+
character_dict_path: *character_dict_path
|
| 120 |
+
use_space_char: *use_space_char
|
| 121 |
+
max_text_length: *max_text_length
|
| 122 |
+
|
| 123 |
+
- KeepKeys:
|
| 124 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 125 |
+
sampler:
|
| 126 |
+
name: RatioSampler
|
| 127 |
+
scales: [[128, 32]] # w, h
|
| 128 |
+
# divided_factor: ensures the width and height can be divided evenly by the downsampling multiple
|
| 129 |
+
first_bs: 256
|
| 130 |
+
fix_bs: false
|
| 131 |
+
divided_factor: [4, 16] # w, h
|
| 132 |
+
is_training: False
|
| 133 |
+
loader:
|
| 134 |
+
shuffle: False
|
| 135 |
+
drop_last: False
|
| 136 |
+
batch_size_per_card: *bs
|
| 137 |
+
max_ratio: 12
|
| 138 |
+
num_workers: 4
|
configs/rec/lpv/svtr_base_lpv.yml
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtr_base_lpv/
|
| 7 |
+
save_epoch_step: [15, 1]
|
| 8 |
+
# evaluation is run every 500 iterations (eval_batch_step: [start_iter, interval])
|
| 9 |
+
eval_batch_step: [0, 500]
|
| 10 |
+
eval_epoch_step: [0, 1]
|
| 11 |
+
cal_metric_during_train: True
|
| 12 |
+
pretrained_model:
|
| 13 |
+
# ./output/rec/u14m_filter/svtr_base_lpv_wo_glrm/best.pth
|
| 14 |
+
checkpoints:
|
| 15 |
+
use_tensorboard: false
|
| 16 |
+
infer_img:
|
| 17 |
+
# for data or label process
|
| 18 |
+
character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
|
| 19 |
+
# ./tools/utils/ppocr_keys_v1.txt # ch
|
| 20 |
+
max_text_length: &max_text_length 25
|
| 21 |
+
use_space_char: &use_space_char False
|
| 22 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtr_lpv.txt
|
| 23 |
+
use_amp: True
|
| 24 |
+
grad_clip_val: 20
|
| 25 |
+
|
| 26 |
+
Optimizer:
|
| 27 |
+
name: Adam
|
| 28 |
+
lr: 0.0001 # for 4gpus bs128/gpu
|
| 29 |
+
weight_decay: 0.0
|
| 30 |
+
filter_bias_and_bn: False
|
| 31 |
+
betas: [0.9, 0.99]
|
| 32 |
+
|
| 33 |
+
LRScheduler:
|
| 34 |
+
name: MultiStepLR
|
| 35 |
+
milestones: [12]
|
| 36 |
+
gamma: 0.1
|
| 37 |
+
|
| 38 |
+
Architecture:
|
| 39 |
+
model_type: rec
|
| 40 |
+
algorithm: LPV
|
| 41 |
+
in_channels: 3
|
| 42 |
+
Transform:
|
| 43 |
+
Encoder:
|
| 44 |
+
name: SVTRNet
|
| 45 |
+
img_size: [32, 128]
|
| 46 |
+
out_char_num: 25
|
| 47 |
+
out_channels: 256
|
| 48 |
+
patch_merging: 'Conv'
|
| 49 |
+
embed_dim: [128, 256, 384]
|
| 50 |
+
depth: [6, 6, 6]
|
| 51 |
+
num_heads: [4, 8, 12]
|
| 52 |
+
mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
|
| 53 |
+
local_mixer: [[5, 5], [5, 5], [5, 5]]
|
| 54 |
+
sub_k: [[1, 1], [1, 1]]
|
| 55 |
+
feature2d: True
|
| 56 |
+
last_stage: False
|
| 57 |
+
prenorm: True
|
| 58 |
+
Decoder:
|
| 59 |
+
name: LPVDecoder
|
| 60 |
+
num_layer: 3
|
| 61 |
+
max_len: *max_text_length
|
| 62 |
+
use_mask: True
|
| 63 |
+
dim_feedforward: 1536
|
| 64 |
+
nhead: 12
|
| 65 |
+
dropout: 0.1
|
| 66 |
+
trans_layer: 3
|
| 67 |
+
|
| 68 |
+
Loss:
|
| 69 |
+
name: LPVLoss
|
| 70 |
+
|
| 71 |
+
PostProcess:
|
| 72 |
+
name: ARLabelDecode
|
| 73 |
+
character_dict_path: *character_dict_path
|
| 74 |
+
use_space_char: *use_space_char
|
| 75 |
+
|
| 76 |
+
Metric:
|
| 77 |
+
name: RecMetric
|
| 78 |
+
main_indicator: acc
|
| 79 |
+
is_filter: True
|
| 80 |
+
|
| 81 |
+
Train:
|
| 82 |
+
dataset:
|
| 83 |
+
name: LMDBDataSet
|
| 84 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 85 |
+
transforms:
|
| 86 |
+
- DecodeImagePIL: # load image
|
| 87 |
+
img_mode: RGB
|
| 88 |
+
- PARSeqAugPIL:
|
| 89 |
+
- ARLabelEncode: # Class handling label
|
| 90 |
+
character_dict_path: *character_dict_path
|
| 91 |
+
use_space_char: *use_space_char
|
| 92 |
+
max_text_length: *max_text_length
|
| 93 |
+
- RecTVResize:
|
| 94 |
+
image_shape: [32, 128]
|
| 95 |
+
padding: False
|
| 96 |
+
- KeepKeys:
|
| 97 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 98 |
+
loader:
|
| 99 |
+
shuffle: True
|
| 100 |
+
batch_size_per_card: 128
|
| 101 |
+
drop_last: True
|
| 102 |
+
num_workers: 4
|
| 103 |
+
|
| 104 |
+
Eval:
|
| 105 |
+
dataset:
|
| 106 |
+
name: LMDBDataSet
|
| 107 |
+
data_dir: ../evaluation/
|
| 108 |
+
transforms:
|
| 109 |
+
- DecodeImagePIL: # load image
|
| 110 |
+
img_mode: RGB
|
| 111 |
+
- ARLabelEncode: # Class handling label
|
| 112 |
+
character_dict_path: *character_dict_path
|
| 113 |
+
use_space_char: *use_space_char
|
| 114 |
+
max_text_length: *max_text_length
|
| 115 |
+
- RecTVResize:
|
| 116 |
+
image_shape: [32, 128]
|
| 117 |
+
padding: False
|
| 118 |
+
- KeepKeys:
|
| 119 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 120 |
+
loader:
|
| 121 |
+
shuffle: False
|
| 122 |
+
drop_last: False
|
| 123 |
+
batch_size_per_card: 128
|
| 124 |
+
num_workers: 4
|
configs/rec/lpv/svtr_base_lpv_wo_glrm.yml
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtr_base_lpv_wo_glrm/
|
| 7 |
+
save_epoch_step: [15, 1]
|
| 8 |
+
# evaluation is run every 500 iterations (eval_batch_step: [start_iter, interval])
|
| 9 |
+
eval_batch_step: [0, 500]
|
| 10 |
+
eval_epoch_step: [0, 1]
|
| 11 |
+
cal_metric_during_train: True
|
| 12 |
+
pretrained_model:
|
| 13 |
+
checkpoints:
|
| 14 |
+
use_tensorboard: false
|
| 15 |
+
infer_img:
|
| 16 |
+
# for data or label process
|
| 17 |
+
character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
|
| 18 |
+
# ./tools/utils/ppocr_keys_v1.txt # ch
|
| 19 |
+
max_text_length: &max_text_length 25
|
| 20 |
+
use_space_char: &use_space_char False
|
| 21 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_lpv_wo_glrm.txt
|
| 22 |
+
use_amp: True
|
| 23 |
+
grad_clip_val: 20
|
| 24 |
+
|
| 25 |
+
Optimizer:
|
| 26 |
+
name: Adam
|
| 27 |
+
lr: 0.0001 # for 4gpus bs128/gpu
|
| 28 |
+
weight_decay: 0.0
|
| 29 |
+
filter_bias_and_bn: False
|
| 30 |
+
betas: [0.9, 0.99]
|
| 31 |
+
|
| 32 |
+
LRScheduler:
|
| 33 |
+
name: MultiStepLR
|
| 34 |
+
milestones: [12]
|
| 35 |
+
gamma: 0.1
|
| 36 |
+
|
| 37 |
+
Architecture:
|
| 38 |
+
model_type: rec
|
| 39 |
+
algorithm: LPV
|
| 40 |
+
in_channels: 3
|
| 41 |
+
Transform:
|
| 42 |
+
Encoder:
|
| 43 |
+
name: SVTRNet
|
| 44 |
+
img_size: [32, 128]
|
| 45 |
+
out_char_num: 25
|
| 46 |
+
out_channels: 256
|
| 47 |
+
patch_merging: 'Conv'
|
| 48 |
+
embed_dim: [128, 256, 384]
|
| 49 |
+
depth: [6, 6, 6]
|
| 50 |
+
num_heads: [4, 8, 12]
|
| 51 |
+
mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
|
| 52 |
+
local_mixer: [[5, 5], [5, 5], [5, 5]]
|
| 53 |
+
sub_k: [[1, 1], [1, 1]]
|
| 54 |
+
feature2d: True
|
| 55 |
+
last_stage: False
|
| 56 |
+
prenorm: True
|
| 57 |
+
Decoder:
|
| 58 |
+
name: LPVDecoder
|
| 59 |
+
num_layer: 3
|
| 60 |
+
max_len: *max_text_length
|
| 61 |
+
use_mask: False
|
| 62 |
+
dim_feedforward: 1536
|
| 63 |
+
nhead: 12
|
| 64 |
+
dropout: 0.1
|
| 65 |
+
trans_layer: 3
|
| 66 |
+
|
| 67 |
+
Loss:
|
| 68 |
+
name: LPVLoss
|
| 69 |
+
|
| 70 |
+
PostProcess:
|
| 71 |
+
name: ARLabelDecode
|
| 72 |
+
character_dict_path: *character_dict_path
|
| 73 |
+
use_space_char: *use_space_char
|
| 74 |
+
|
| 75 |
+
Metric:
|
| 76 |
+
name: RecMetric
|
| 77 |
+
main_indicator: acc
|
| 78 |
+
is_filter: True
|
| 79 |
+
|
| 80 |
+
Train:
|
| 81 |
+
dataset:
|
| 82 |
+
name: LMDBDataSet
|
| 83 |
+
data_dir: ../Union14M-L-LMDB-Filtered
|
| 84 |
+
transforms:
|
| 85 |
+
- DecodeImagePIL: # load image
|
| 86 |
+
img_mode: RGB
|
| 87 |
+
- PARSeqAugPIL:
|
| 88 |
+
- ARLabelEncode: # Class handling label
|
| 89 |
+
character_dict_path: *character_dict_path
|
| 90 |
+
use_space_char: *use_space_char
|
| 91 |
+
max_text_length: *max_text_length
|
| 92 |
+
- RecTVResize:
|
| 93 |
+
image_shape: [32, 128]
|
| 94 |
+
padding: False
|
| 95 |
+
- KeepKeys:
|
| 96 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 97 |
+
loader:
|
| 98 |
+
shuffle: True
|
| 99 |
+
batch_size_per_card: 128
|
| 100 |
+
drop_last: True
|
| 101 |
+
num_workers: 4
|
| 102 |
+
|
| 103 |
+
Eval:
|
| 104 |
+
dataset:
|
| 105 |
+
name: LMDBDataSet
|
| 106 |
+
data_dir: ../evaluation/
|
| 107 |
+
transforms:
|
| 108 |
+
- DecodeImagePIL: # load image
|
| 109 |
+
img_mode: RGB
|
| 110 |
+
- ARLabelEncode: # Class handling label
|
| 111 |
+
character_dict_path: *character_dict_path
|
| 112 |
+
use_space_char: *use_space_char
|
| 113 |
+
max_text_length: *max_text_length
|
| 114 |
+
- RecTVResize:
|
| 115 |
+
image_shape: [32, 128]
|
| 116 |
+
padding: False
|
| 117 |
+
- KeepKeys:
|
| 118 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 119 |
+
loader:
|
| 120 |
+
shuffle: False
|
| 121 |
+
drop_last: False
|
| 122 |
+
batch_size_per_card: 128
|
| 123 |
+
num_workers: 4
|
configs/rec/lpv/svtrv2_lpv.yml
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 20
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtrv2_lpv/
|
| 7 |
+
save_epoch_step: [15, 1]
|
| 8 |
+
# evaluation is run every 500 iterations, starting from iteration 0
|
| 9 |
+
eval_batch_step: [0, 500]
|
| 10 |
+
eval_epoch_step: [0, 1]
|
| 11 |
+
cal_metric_during_train: True
|
| 12 |
+
pretrained_model:
|
| 13 |
+
# ./output/rec/u14m_filter/svtrv2_lpv_wo_glrm/best.pth
|
| 14 |
+
checkpoints:
|
| 15 |
+
use_tensorboard: false
|
| 16 |
+
infer_img:
|
| 17 |
+
# for data or label process
|
| 18 |
+
character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
|
| 19 |
+
# ./tools/utils/ppocr_keys_v1.txt # ch
|
| 20 |
+
max_text_length: &max_text_length 25
|
| 21 |
+
use_space_char: &use_space_char False
|
| 22 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_lpv.txt
|
| 23 |
+
use_amp: True
|
| 24 |
+
grad_clip_val: 20
|
| 25 |
+
|
| 26 |
+
Optimizer:
|
| 27 |
+
name: AdamW
|
| 28 |
+
lr: 0.000325 # for 4gpus bs128/gpu
|
| 29 |
+
weight_decay: 0.05
|
| 30 |
+
filter_bias_and_bn: True
|
| 31 |
+
|
| 32 |
+
LRScheduler:
|
| 33 |
+
name: OneCycleLR
|
| 34 |
+
warmup_epoch: 1 # pct_start 0.075*20 = 1.5ep
|
| 35 |
+
cycle_momentum: False
|
| 36 |
+
|
| 37 |
+
Architecture:
|
| 38 |
+
model_type: rec
|
| 39 |
+
algorithm: LPV
|
| 40 |
+
in_channels: 3
|
| 41 |
+
Transform:
|
| 42 |
+
Encoder:
|
| 43 |
+
name: SVTRv2LNConvTwo33
|
| 44 |
+
use_pos_embed: False
|
| 45 |
+
dims: [128, 256, 384]
|
| 46 |
+
depths: [6, 6, 6]
|
| 47 |
+
num_heads: [4, 8, 12]
|
| 48 |
+
mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
|
| 49 |
+
local_k: [[5, 5], [5, 5], [-1, -1]]
|
| 50 |
+
sub_k: [[1, 1], [2, 1], [-1, -1]]
|
| 51 |
+
last_stage: false
|
| 52 |
+
feat2d: True
|
| 53 |
+
Decoder:
|
| 54 |
+
name: LPVDecoder
|
| 55 |
+
num_layer: 3
|
| 56 |
+
max_len: *max_text_length
|
| 57 |
+
use_mask: True
|
| 58 |
+
dim_feedforward: 1536
|
| 59 |
+
nhead: 12
|
| 60 |
+
dropout: 0.1
|
| 61 |
+
trans_layer: 3
|
| 62 |
+
|
| 63 |
+
Loss:
|
| 64 |
+
name: LPVLoss
|
| 65 |
+
|
| 66 |
+
PostProcess:
|
| 67 |
+
name: ARLabelDecode
|
| 68 |
+
character_dict_path: *character_dict_path
|
| 69 |
+
use_space_char: *use_space_char
|
| 70 |
+
|
| 71 |
+
Metric:
|
| 72 |
+
name: RecMetric
|
| 73 |
+
main_indicator: acc
|
| 74 |
+
is_filter: True
|
| 75 |
+
|
| 76 |
+
Train:
|
| 77 |
+
dataset:
|
| 78 |
+
name: RatioDataSetTVResize
|
| 79 |
+
ds_width: True
|
| 80 |
+
padding: false
|
| 81 |
+
data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
|
| 82 |
+
'../Union14M-L-LMDB-Filtered/filter_train_hard',
|
| 83 |
+
'../Union14M-L-LMDB-Filtered/filter_train_medium',
|
| 84 |
+
'../Union14M-L-LMDB-Filtered/filter_train_normal',
|
| 85 |
+
'../Union14M-L-LMDB-Filtered/filter_train_easy',
|
| 86 |
+
]
|
| 87 |
+
transforms:
|
| 88 |
+
- DecodeImagePIL: # load image
|
| 89 |
+
img_mode: RGB
|
| 90 |
+
- PARSeqAugPIL:
|
| 91 |
+
- ARLabelEncode: # Class handling label
|
| 92 |
+
character_dict_path: *character_dict_path
|
| 93 |
+
use_space_char: *use_space_char
|
| 94 |
+
max_text_length: *max_text_length
|
| 95 |
+
- KeepKeys:
|
| 96 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 97 |
+
sampler:
|
| 98 |
+
name: RatioSampler
|
| 99 |
+
scales: [[128, 32]] # w, h
|
| 100 |
+
# divided_factor: ensures the width and height are divisible by the downsampling factor
|
| 101 |
+
first_bs: &bs 128
|
| 102 |
+
fix_bs: false
|
| 103 |
+
divided_factor: [4, 16] # w, h
|
| 104 |
+
is_training: True
|
| 105 |
+
loader:
|
| 106 |
+
shuffle: True
|
| 107 |
+
batch_size_per_card: *bs
|
| 108 |
+
drop_last: True
|
| 109 |
+
max_ratio: &max_ratio 4
|
| 110 |
+
num_workers: 4
|
| 111 |
+
|
| 112 |
+
Eval:
|
| 113 |
+
dataset:
|
| 114 |
+
name: RatioDataSetTVResize
|
| 115 |
+
ds_width: True
|
| 116 |
+
padding: False
|
| 117 |
+
data_dir_list: [
|
| 118 |
+
'../evaluation/CUTE80',
|
| 119 |
+
'../evaluation/IC13_857',
|
| 120 |
+
'../evaluation/IC15_1811',
|
| 121 |
+
'../evaluation/IIIT5k',
|
| 122 |
+
'../evaluation/SVT',
|
| 123 |
+
'../evaluation/SVTP',
|
| 124 |
+
]
|
| 125 |
+
transforms:
|
| 126 |
+
- DecodeImagePIL: # load image
|
| 127 |
+
img_mode: RGB
|
| 128 |
+
- ARLabelEncode: # Class handling label
|
| 129 |
+
character_dict_path: *character_dict_path
|
| 130 |
+
use_space_char: *use_space_char
|
| 131 |
+
max_text_length: *max_text_length
|
| 132 |
+
- KeepKeys:
|
| 133 |
+
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
|
| 134 |
+
sampler:
|
| 135 |
+
name: RatioSampler
|
| 136 |
+
scales: [[128, 32]] # w, h
|
| 137 |
+
# divided_factor: ensures the width and height are divisible by the downsampling factor
|
| 138 |
+
first_bs: *bs
|
| 139 |
+
fix_bs: false
|
| 140 |
+
divided_factor: [4, 16] # w, h
|
| 141 |
+
is_training: False
|
| 142 |
+
loader:
|
| 143 |
+
shuffle: False
|
| 144 |
+
drop_last: False
|
| 145 |
+
batch_size_per_card: *bs
|
| 146 |
+
max_ratio: *max_ratio
|
| 147 |
+
num_workers: 4
|