topdu committed
Commit c898ed6 · 1 Parent(s): d5923fc
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. app.py +361 -0
  2. configs/dataset/rec/evaluation.yaml +41 -0
  3. configs/dataset/rec/ltb.yaml +9 -0
  4. configs/dataset/rec/mjsynth.yaml +11 -0
  5. configs/dataset/rec/openvino.yaml +25 -0
  6. configs/dataset/rec/ost.yaml +17 -0
  7. configs/dataset/rec/synthtext.yaml +7 -0
  8. configs/dataset/rec/test.yaml +77 -0
  9. configs/dataset/rec/textocr.yaml +13 -0
  10. configs/dataset/rec/textocr_horizontal.yaml +13 -0
  11. configs/dataset/rec/union14m_b.yaml +47 -0
  12. configs/dataset/rec/union14m_l_filtered.yaml +35 -0
  13. configs/det/dbnet/repvit_db.yml +171 -0
  14. configs/rec/abinet/resnet45_trans_abinet_lang.yml +94 -0
  15. configs/rec/abinet/resnet45_trans_abinet_wo_lang.yml +93 -0
  16. configs/rec/abinet/svtrv2_abinet_lang.yml +130 -0
  17. configs/rec/abinet/svtrv2_abinet_wo_lang.yml +128 -0
  18. configs/rec/aster/resnet31_lstm_aster_tps_on.yml +93 -0
  19. configs/rec/aster/svtrv2_aster.yml +127 -0
  20. configs/rec/aster/svtrv2_aster_tps_on.yml +102 -0
  21. configs/rec/autostr/autostr_lstm_aster_tps_on.yml +95 -0
  22. configs/rec/busnet/svtrv2_busnet.yml +135 -0
  23. configs/rec/busnet/svtrv2_busnet_pretraining.yml +134 -0
  24. configs/rec/busnet/vit_busnet.yml +104 -0
  25. configs/rec/busnet/vit_busnet_pretraining.yml +104 -0
  26. configs/rec/cam/convnextv2_cam_tps_on.yml +118 -0
  27. configs/rec/cam/convnextv2_tiny_cam_tps_on.yml +118 -0
  28. configs/rec/cam/svtrv2_cam_tps_on.yml +123 -0
  29. configs/rec/cdistnet/resnet45_trans_cdistnet.yml +93 -0
  30. configs/rec/cdistnet/svtrv2_cdistnet.yml +139 -0
  31. configs/rec/cppd/svtr_base_cppd.yml +123 -0
  32. configs/rec/cppd/svtr_base_cppd_ch.yml +126 -0
  33. configs/rec/cppd/svtr_base_cppd_h8.yml +123 -0
  34. configs/rec/cppd/svtr_base_cppd_syn.yml +124 -0
  35. configs/rec/cppd/svtrv2_cppd.yml +150 -0
  36. configs/rec/dan/resnet45_fpn_dan.yml +98 -0
  37. configs/rec/dan/svtrv2_dan.yml +130 -0
  38. configs/rec/dptr/dptr_parseq_pretrain.yml +88 -0
  39. configs/rec/focalsvtr/focalsvtr_ctc.yml +137 -0
  40. configs/rec/gtc/svtrv2_lnconv_nrtr_gtc.yml +168 -0
  41. configs/rec/gtc/svtrv2_lnconv_smtr_gtc_long_infer.yml +151 -0
  42. configs/rec/gtc/svtrv2_lnconv_smtr_gtc_smtr_long.yml +150 -0
  43. configs/rec/gtc/svtrv2_lnconv_smtr_gtc_stream.yml +152 -0
  44. configs/rec/igtr/readme.md +192 -0
  45. configs/rec/igtr/svtr_base_ds_igtr.yml +157 -0
  46. configs/rec/lister/focalsvtr_lister_wo_fem_maxratio12.yml +133 -0
  47. configs/rec/lister/svtrv2_lister_wo_fem_maxratio12.yml +138 -0
  48. configs/rec/lpv/svtr_base_lpv.yml +124 -0
  49. configs/rec/lpv/svtr_base_lpv_wo_glrm.yml +123 -0
  50. configs/rec/lpv/svtrv2_lpv.yml +147 -0
app.py ADDED
@@ -0,0 +1,361 @@
1
+ import os
2
+ import uuid
3
+ import shutil
4
+ import re
5
+ import base64
6
+ import gradio as gr
7
+ from PIL import Image
8
+
9
+ from tools.infer_doc import OpenDoc
10
+ from tools.utils.logging import get_logger
11
+
12
+ logger = get_logger(name='opendoc_gradio')
13
+
14
+ # Initialize the pipeline
15
+ pipeline: OpenDoc | None = None
16
+
17
+
18
+ def get_pipeline(gpu_id: int) -> OpenDoc:
19
+ """获取或初始化OpenDoc流水线
20
+
21
+ Args:
22
+ gpu_id: GPU device ID; -1 means use CPU
23
+
24
+ Returns:
25
+ OpenDoc: the initialized OpenDoc instance
26
+ """
27
+ global pipeline
28
+ if pipeline is None:
29
+ logger.info(
30
+ f"Initializing OpenDoc pipeline on {'GPU ' + str(gpu_id) if gpu_id >= 0 else 'CPU'}..."
31
+ )
32
+ pipeline = OpenDoc(gpuId=gpu_id)
33
+ return pipeline
34
+
35
+
36
+ # Ensure pipeline is initialized
37
+ try:
38
+ current_pipeline = get_pipeline(0)
39
+ except Exception as e:
40
+ raise e
41
+
42
+
43
+ def process_image(image_path: str | None) -> tuple[Image.Image | None, str, str, str | None, str, str]:
44
+ """处理图片并进行OCR识别
45
+
46
+ Args:
47
+ image_path: path to the image file; None means no image
48
+
49
+ Returns:
50
+ tuple: (visualization image, Markdown content (base64 images), JSON content, ZIP file path, raw Markdown, Markdown content (base64 images))
51
+ """
52
+ if image_path is None:
53
+ return None, '', '', None, '', ''
54
+
55
+ # Get original image name
56
+ base_name = os.path.splitext(os.path.basename(image_path))[0]
57
+ file_ext = os.path.splitext(image_path)[1] or '.jpg'
58
+
59
+ # Create a directory with image name for this request
60
+ output_base_dir = 'gradio_outputs'
61
+ os.makedirs(output_base_dir, exist_ok=True)
62
+
63
+ # Add a short unique suffix (from a UUID) to avoid conflicts if the same filename is uploaded multiple times
64
+ timestamp = str(uuid.uuid4())[:8]
65
+ folder_name = f"{base_name}_{timestamp}"
66
+ tmp_dir = os.path.join(output_base_dir, folder_name)
67
+ os.makedirs(tmp_dir, exist_ok=True)
68
+
69
+ try:
70
+ # Copy and rename the input image
71
+ tmp_img_path = os.path.join(tmp_dir, f'{base_name}{file_ext}')
72
+ image = Image.open(image_path)
73
+ image.save(tmp_img_path)
74
+
75
+ # Predict
76
+ output = list(
77
+ current_pipeline.predict(tmp_img_path,
78
+ use_doc_orientation_classify=False,
79
+ use_doc_unwarping=False))
80
+ if not output:
81
+ return None, 'No results found.', '', None, '', ''
82
+
83
+ res = output[0]
84
+
85
+ # Save results
86
+ res.save_to_img(tmp_dir)
87
+ res.save_to_markdown(tmp_dir, pretty=True)
88
+ res.save_to_json(tmp_dir)
89
+
90
+ # Find the saved files
91
+ vis_img = None
92
+ for f in os.listdir(tmp_dir):
93
+ if 'layout_order_res' in f:
94
+ vis_img_path = os.path.join(tmp_dir, f)
95
+ vis_img = Image.open(vis_img_path)
96
+ break
97
+
98
+ markdown_content = ''
99
+ md_file_path = None
100
+ for f in os.listdir(tmp_dir):
101
+ if f.endswith('.md'):
102
+ md_file_path = os.path.join(tmp_dir, f)
103
+ with open(md_file_path, 'r', encoding='utf-8') as file:
104
+ markdown_content = file.read()
105
+ break
106
+
107
+ # Convert relative image paths to base64 for proper display in Gradio
108
+ if markdown_content:
109
+
110
+ def replace_img_with_base64(match):
111
+ img_path = match.group(1)
112
+ full_img_path = os.path.join(tmp_dir, img_path)
113
+
114
+ if os.path.exists(full_img_path):
115
+ try:
116
+ with open(full_img_path, 'rb') as img_file:
117
+ img_data = base64.b64encode(img_file.read()).decode('utf-8')
118
+ # Determine image format
119
+ ext = os.path.splitext(full_img_path)[1].lower()
120
+ mime_type = 'image/jpeg' if ext in ['.jpg', '.jpeg'] else 'image/png'
121
+ # Replace src with base64 data URL
122
+ return match.group(0).replace(f'src="{img_path}"', f'src="data:{mime_type};base64,{img_data}"')
123
+ except Exception as e:
124
+ logger.warning(f'Failed to convert image {img_path} to base64: {e}')
125
+ return match.group(0)
126
+
127
+ # Find all img tags and replace their src
128
+ markdown_content_show = re.sub(r'<img[^>]*src="([^"]+)"[^>]*>', replace_img_with_base64, markdown_content)
129
+ else:
130
+ markdown_content_show = markdown_content
131
+
132
+ json_content = ''
133
+ json_file_path = None
134
+ for f in os.listdir(tmp_dir):
135
+ if f.endswith('.json'):
136
+ json_file_path = os.path.join(tmp_dir, f)
137
+ with open(json_file_path, 'r', encoding='utf-8') as file:
138
+ json_content = file.read()
139
+ break
140
+
141
+ # Prepare all files in tmp_dir for download by creating a zip archive
142
+ zip_path = os.path.join(output_base_dir, f'{folder_name}.zip')
143
+ _ = shutil.make_archive(zip_path.replace('.zip', ''), 'zip', tmp_dir)
144
+
145
+ return vis_img, markdown_content_show, json_content, zip_path, markdown_content, markdown_content_show
146
+
147
+ except Exception as e:
148
+ logger.error(f'Prediction error: {str(e)}')
149
+ return None, f'Error during prediction: {str(e)}', '', None, '', ''
150
+
151
+
152
+ # Custom CSS with adaptive colors
153
+ custom_css = """
154
+ body, .gradio-container {
155
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Noto Sans", Helvetica, Arial, sans-serif;
156
+ }
157
+ .app-header {
158
+ text-align: center;
159
+ max-width: 1200px;
160
+ margin: 20px auto !important;
161
+ padding: 20px;
162
+ }
163
+ .app-header h1 {
164
+ font-size: 2.5em;
165
+ font-weight: 700;
166
+ margin-bottom: 10px;
167
+ }
168
+ .app-header p {
169
+ font-size: 1.1em;
170
+ opacity: 0.7;
171
+ line-height: 1.6;
172
+ }
173
+ .quick-links {
174
+ text-align: center;
175
+ padding: 12px 0;
176
+ border: 1px solid var(--border-color-primary);
177
+ border-radius: 12px;
178
+ margin: 16px auto;
179
+ max-width: 1200px;
180
+ background: var(--background-fill-secondary);
181
+ }
182
+ .quick-links a {
183
+ margin: 0 16px;
184
+ font-size: 15px;
185
+ font-weight: 600;
186
+ color: var(--link-text-color);
187
+ text-decoration: none;
188
+ transition: all 0.3s ease;
189
+ }
190
+ .quick-links a:hover {
191
+ opacity: 0.8;
192
+ text-decoration: underline;
193
+ }
194
+ .upload-section {
195
+ border: 2px dashed var(--border-color-primary);
196
+ border-radius: 12px;
197
+ padding: 20px;
198
+ background: var(--background-fill-secondary);
199
+ transition: all 0.3s ease;
200
+ }
201
+ .upload-section:hover {
202
+ border-color: var(--color-accent);
203
+ background: var(--background-fill-primary);
204
+ }
205
+ #vis_output {
206
+ min-height: 400px;
207
+ border-radius: 12px;
208
+ overflow: hidden;
209
+ }
210
+ #md_preview {
211
+ max-height: 600px;
212
+ min-height: 200px;
213
+ overflow: auto;
214
+ padding: 20px;
215
+ background: var(--background-fill-primary);
216
+ border-radius: 12px;
217
+ box-shadow: var(--shadow-drop);
218
+ }
219
+ #md_preview img {
220
+ display: block;
221
+ margin: 16px auto;
222
+ max-width: 100%;
223
+ height: auto;
224
+ border-radius: 8px;
225
+ }
226
+ .notice {
227
+ margin: 20px auto;
228
+ max-width: 1200px;
229
+ padding: 16px 20px;
230
+ border-left: 4px solid var(--color-accent);
231
+ border-radius: 8px;
232
+ background: var(--background-fill-secondary);
233
+ font-size: 14px;
234
+ line-height: 1.8;
235
+ }
236
+ .notice strong {
237
+ font-weight: 700;
238
+ color: var(--color-accent);
239
+ }
240
+ .notice ul {
241
+ margin-top: 8px;
242
+ padding-left: 20px;
243
+ }
244
+ .notice li {
245
+ margin: 8px 0;
246
+ }
247
+ .gradio-button-primary {
248
+ font-weight: 600 !important;
249
+ transition: all 0.3s ease !important;
250
+ }
251
+ .gradio-button-primary:hover {
252
+ transform: translateY(-2px);
253
+ box-shadow: var(--shadow-drop-lg) !important;
254
+ }
255
+ """
256
+
257
+ # LaTeX delimiters for formula rendering
258
+ LATEX_DELIMS = [
259
+ {"left": "$$", "right": "$$", "display": True},
260
+ {"left": "$", "right": "$", "display": False},
261
+ {"left": "\\(", "right": "\\)", "display": False},
262
+ {"left": "\\[", "right": "\\]", "display": True},
263
+ ]
264
+
265
+
266
+ # Define the Gradio Interface
267
+ def create_demo() -> gr.Blocks:
268
+ """创建Gradio演示界面
269
+
270
+ Returns:
271
+ gr.Blocks: the Gradio Blocks application instance
272
+ """
273
+ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title='OpenDoc-0.1B Demo') as demo:
274
+ # Header
275
+ gr.HTML("""
276
+ <div class="app-header">
277
+ <h1>🚀 OpenDoc-0.1B</h1>
278
+ <p>Ultra-Lightweight Document Parsing System with 0.1B Parameters (built by <a href="https://github.com/Topdu/OpenOCR">OCR Team</a>, <a href="https://fvl.fudan.edu.cn">FVL Lab</a>)</p>
279
+ <p style="font-size: 0.95em; color: #888;">
280
+ Powered by <a href="https://www.paddleocr.ai/latest/version3.x/module_usage/layout_analysis.html" target="_blank">PP-DocLayoutV2</a> for layout analysis and <a href="https://arxiv.org/pdf/2512.21095" target="_blank">UniRec-0.1B</a> for unified recognition of text, formulas, and tables
281
+ </p>
282
+ </div>
283
+ """)
284
+
285
+ # Quick links
286
+ gr.HTML("""
287
+ <div class="quick-links">
288
+ <a href="https://github.com/Topdu/OpenOCR" target="_blank">📖 GitHub</a>
289
+ <a href="https://arxiv.org/pdf/2512.21095" target="_blank">📄 Paper</a>
290
+ <a href="https://huggingface.co/topdu/unirec-0.1b" target="_blank">🤗 Model</a>
291
+ </div>
292
+ """)
293
+
294
+ with gr.Row():
295
+ with gr.Column(scale=5, elem_classes=["upload-section"]):
296
+ input_img = gr.Image(type='filepath', label='📤 Upload Document Image', height=400)
297
+
298
+ gr.Markdown("""
299
+ ### 💡 Tips
300
+ - Supports Chinese and English documents
301
+ - Best for reports, papers, magazines, and complex layouts
302
+ - Handles text, formulas, tables, and images
303
+ """)
304
+
305
+ btn = gr.Button('🔍 Analyze Document', variant='primary', size='lg')
306
+ download_output = gr.File(label='📥 Download All Results (ZIP)', visible=True)
307
+
308
+ with gr.Column(scale=7):
309
+ with gr.Tabs():
310
+ with gr.Tab('📝 Markdown Preview'):
311
+ output_md = gr.Markdown(
312
+ 'Please upload an image and click "Analyze Document" to see results.',
313
+ latex_delimiters=LATEX_DELIMS,
314
+ elem_id='md_preview'
315
+ )
316
+ with gr.Tab('📊 Layout Visualization'):
317
+ output_vis = gr.Image(type='pil', label='Layout Analysis Results', elem_id='vis_output')
318
+
319
+ with gr.Tab('📄 Raw Markdown'):
320
+ output_md_raw = gr.Code(
321
+ label='Markdown Source',
322
+ language='markdown',
323
+ lines=20
324
+ )
325
+ with gr.Tab('📄 Raw Markdown with Base64 Images'):
326
+ output_md_raw_with_base64 = gr.Code(
327
+ label='Markdown Source',
328
+ language='markdown',
329
+ lines=20
330
+ )
331
+
332
+ with gr.Tab('🗂️ JSON Result'):
333
+ output_json = gr.Code(label='Structured Data', language='json')
334
+
335
+ # Feature notice
336
+ gr.HTML("""
337
+ <div class="notice">
338
+ <strong>✨ Key Features:</strong>
339
+ <ul>
340
+ <li><strong>Ultra-lightweight:</strong> Only 0.1B parameters, fast inference speed</li>
341
+ <li><strong>High accuracy:</strong> Achieves 90.57% on OmniDocBench (v1.5)</li>
342
+ <li><strong>Unified recognition:</strong> Handles text, formulas, and tables in one model</li>
343
+ <li><strong>Rich output:</strong> Provides Markdown, JSON, and visualization results</li>
344
+ </ul>
345
+ </div>
346
+ """)
347
+
348
+ btn.click(
349
+ fn=process_image,
350
+ inputs=[input_img],
351
+ outputs=[output_vis, output_md, output_json, download_output, output_md_raw, output_md_raw_with_base64]
352
+ )
353
+
354
+ return demo
355
+
356
+
357
+ if __name__ == '__main__':
358
+ demo = create_demo()
359
+ demo.queue(max_size=20).launch(
360
+ share=False
361
+ )
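
Note: below is a minimal standalone sketch of the same pipeline calls that app.py wraps, useful for scripted (non-Gradio) use. It assumes tools.infer_doc.OpenDoc behaves exactly as it is invoked above; the sample image path and output directory are illustrative.

import os
from tools.infer_doc import OpenDoc

# Same constructor and predict() call used in get_pipeline()/process_image() above.
pipeline = OpenDoc(gpuId=0)  # pass -1 to run on CPU
results = list(pipeline.predict('sample_doc.jpg',  # illustrative path
                                use_doc_orientation_classify=False,
                                use_doc_unwarping=False))
if results:
    res = results[0]
    os.makedirs('outputs', exist_ok=True)
    res.save_to_img('outputs')
    res.save_to_markdown('outputs', pretty=True)
    res.save_to_json('outputs')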
configs/dataset/rec/evaluation.yaml ADDED
@@ -0,0 +1,41 @@
1
+ root: ../evaluation
2
+ task: str
3
+ download_links:
4
+ # IC15_1811
5
+ - https://drive.usercontent.google.com/download?id=1eGY0kXNV1qVxeUpoGzs-ioUO-ky7msH6&authuser=0&confirm=t
6
+ - https://drive.usercontent.google.com/download?id=1BWv7aLoLAT7avY326gXP3GJF48UZpuBC&authuser=0&confirm=t
7
+ # SVT
8
+ - https://drive.usercontent.google.com/download?id=1ecEZ4cJ7dIbTCZRltE0s5KzUotQWagH-&authuser=0&confirm=t
9
+ - https://drive.usercontent.google.com/download?id=1OygBP7i9R-3Pwi6WodCcW31J8CUMugOJ&authuser=0&confirm=t
10
+ # IIIT5k
11
+ - https://drive.usercontent.google.com/download?id=1PJ9_IvIGZTS5hHdGLnpKuYKZcCO8jE0E&authuser=0&confirm=t
12
+ - https://drive.usercontent.google.com/download?id=10P3MixSBt1v8k8_6aFfziC33Z5IlM6Uf&authuser=0&confirm=t
13
+ # IC13_857
14
+ - https://drive.usercontent.google.com/download?id=1-wMHOFBXJaOaY-UD00nDn6qw2s_8R4Vd&authuser=0&confirm=t
15
+ - https://drive.usercontent.google.com/download?id=1J1QCFtOFxFKiLJIgTqZ6eRo9Y5QGqHpA&authuser=0&confirm=t
16
+ # SVTP
17
+ - https://drive.usercontent.google.com/download?id=1kckwfZkdaHG8k_FW5IIJKUaYZkF21Hza&authuser=0&confirm=t
18
+ - https://drive.usercontent.google.com/download?id=1x61lm_ea7lvIdxNPMG-jy-5W0MxtdH0N&authuser=0&confirm=t
19
+ # CUTE80
20
+ - https://drive.usercontent.google.com/download?id=1Zv_91c81tinLy5Je89HPr-5wUSnqXKIB&authuser=0&confirm=t
21
+ - https://drive.usercontent.google.com/download?id=1OuJ6QoJ9AlyNHIM9j2WedAPxTnac7kyY&authuser=0&confirm=t
22
+ filenames:
23
+ # IC15_1811
24
+ - ../evaluation/IC15_1811/data.mdb
25
+ - ../evaluation/IC15_1811/lock.mdb
26
+ # SVT
27
+ - ../evaluation/SVT/data.mdb
28
+ - ../evaluation/SVT/lock.mdb
29
+ # IIIT5k
30
+ - ../evaluation/IIIT5k/data.mdb
31
+ - ../evaluation/IIIT5k/lock.mdb
32
+ # IC13_857
33
+ - ../evaluation/IC13_857/data.mdb
34
+ - ../evaluation/IC13_857/lock.mdb
35
+ # SVTP
36
+ - ../evaluation/SVTP/data.mdb
37
+ - ../evaluation/SVTP/lock.mdb
38
+ # CUTE80
39
+ - ../evaluation/CUTE80/data.mdb
40
+ - ../evaluation/CUTE80/lock.mdb
41
+ check_validity: true
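
Note: the dataset YAMLs added in this commit share one schema: download_links and filenames are parallel lists, root is the target directory, and check_validity gates an integrity check. A rough sketch of how such a config could be consumed (a hypothetical helper, not the repository's actual download tool):

import os
import urllib.request
import yaml

with open('configs/dataset/rec/evaluation.yaml', 'r', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

# download_links[i] is expected to correspond to filenames[i]
for url, path in zip(cfg['download_links'], cfg['filenames']):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if not os.path.exists(path):
        # Google Drive links may require a confirmed session/cookie in practice
        urllib.request.urlretrieve(url, path)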
configs/dataset/rec/ltb.yaml ADDED
@@ -0,0 +1,9 @@
1
+ root: ../ltb
2
+ task: str
3
+ download_links:
4
+ - https://drive.usercontent.google.com/download?id=16AEA1YGTsyVB44uEjKi4ZUV1snjCYBr4&authuser=0&confirm=t
5
+ - https://drive.usercontent.google.com/download?id=1xU4OStrOaI23bPG4flWAPWn2YrQe2bmY&authuser=0&confirm=t
6
+ filenames:
7
+ - ../ltb/data.mdb
8
+ - ../ltb/lock.mdb
9
+ check_validity: true
configs/dataset/rec/mjsynth.yaml ADDED
@@ -0,0 +1,11 @@
1
+ root: ../synth
2
+ task: str
3
+ download_links:
4
+ - https://drive.usercontent.google.com/download?id=1FIoplSFZ-BKQoRDHDXsVMKa844e-K8PD&authuser=0&confirm=t
5
+ - https://drive.usercontent.google.com/download?id=1eckTvaeRtlTZvbO2orrVz-cIuIk6i87K&authuser=0&confirm=t
6
+ - https://drive.usercontent.google.com/download?id=1PBXTf-2PnmEvJBsqzJqxxRwzhAZGTiMG&authuser=0&confirm=t
7
+ filenames:
8
+ - ../synth/MJ_train.zip
9
+ - ../synth/MJ_val.zip
10
+ - ../synth/MJ_test.zip
11
+ check_validity: true
configs/dataset/rec/openvino.yaml ADDED
@@ -0,0 +1,25 @@
1
+ root: ../OpenVINO
2
+ task: str
3
+ download_links:
4
+ # train_1
5
+ - https://drive.usercontent.google.com/download?id=1q23QAIRTyG0t-bBm4aAwRwiqB6VUfphw&authuser=0&confirm=
6
+ # train_2
7
+ - https://drive.usercontent.google.com/download?id=1AtbaJljM68cbZqi5lcM92d9VkQUCbSqI&authuser=0&confirm=
8
+ # train_5
9
+ - https://drive.usercontent.google.com/download?id=1dejstYnJ8_sESuO_uvwi__jT1B8gPxf3&authuser=0&confirm=t
10
+ # train_f
11
+ - https://drive.usercontent.google.com/download?id=1C4akchTc7-yi1OS_sJ3KP693UKcnecke&authuser=0&confirm=t
12
+ # validation
13
+ - https://drive.usercontent.google.com/download?id=17TRzSQhuK_juAxAv3KmX0y13pQP2cz6R&authuser=0&confirm=t
14
+ filenames:
15
+ # train_1
16
+ - ../OpenVINO/train_1.zip
17
+ # train_2
18
+ - ../OpenVINO/train_2.zip
19
+ # train_5
20
+ - ../OpenVINO/train_5.zip
21
+ # train_f
22
+ - ../OpenVINO/train_f.zip
23
+ # validation
24
+ - ../OpenVINO/validation.zip
25
+ check_validity: true
configs/dataset/rec/ost.yaml ADDED
@@ -0,0 +1,17 @@
1
+ root: ../OST
2
+ task: str
3
+ download_links:
4
+ # OST heavy
5
+ - https://drive.usercontent.google.com/download?id=1RGpIFbD_SRlrzZFBoVF_LGvetNx1-5pg&authuser=0&confirm=t
6
+ - https://drive.usercontent.google.com/download?id=1Th4MfDf44k0EBpIqCLqVoGRu6G-FP1hq&authuser=0&confirm=t
7
+ # OST weak
8
+ - https://drive.usercontent.google.com/download?id=1z5CTDJucUnvALG12Q4UXk1DDKJDd8WJn&authuser=0&confirm=t
9
+ - https://drive.usercontent.google.com/download?id=1V17TTkX3sjpV7v0km_F2SDCK0tL3k_ls&authuser=0&confirm=t
10
+ filenames:
11
+ # OST heavy
12
+ - ../OST/heavy/data.mdb
13
+ - ../OST/heavy/lock.mdb
14
+ # OST weak
15
+ - ../OST/weak/data.mdb
16
+ - ../OST/weak/lock.mdb
17
+ check_validity: true
configs/dataset/rec/synthtext.yaml ADDED
@@ -0,0 +1,7 @@
1
+ root: ../synth
2
+ task: str
3
+ download_links:
4
+ - https://drive.usercontent.google.com/download?id=1T-enqkq6_l2HqrsV3da_h0oJ7CUKu_oc&authuser=0&confirm=t
5
+ filenames:
6
+ - ../synth/ST.zip
7
+ check_validity: true
configs/dataset/rec/test.yaml ADDED
@@ -0,0 +1,77 @@
1
+ root: ../test
2
+ task: str
3
+ download_links:
4
+ # IC13_857
5
+ - https://drive.usercontent.google.com/download?id=1PZSCbe6_DI8MlCqCRWXGT2PP92_frIXq&authuser=0&confirm=t
6
+ - https://drive.usercontent.google.com/download?id=1qkN7NDg0zUHxUiZHAeEatDTqlsgpFWp3&authuser=0&confirm=t
7
+ # IC15_2077
8
+ - https://drive.usercontent.google.com/download?id=1dFkY3DNbr-Mepn3TWBiA9COEJ63fGFcp&authuser=0&confirm=t
9
+ - https://drive.usercontent.google.com/download?id=1UvVwLNZ3tS1YdTBa8MulPzjeVezKaDro&authuser=0&confirm=t
10
+ # SVTP
11
+ - https://drive.usercontent.google.com/download?id=1aofeerilxJ7J3S7QxuCEXbmXTpz8Xshx&authuser=0&confirm=t
12
+ - https://drive.usercontent.google.com/download?id=1rJ1KoO4K_VUxEAUN_bMgBGzK8_JZAAno&authuser=0&confirm=t
13
+ # IIIT5k
14
+ - https://drive.usercontent.google.com/download?id=1XFO2M1Kbgwv3-iTNTmhQXAEjNmKYOeoT&authuser=0&confirm=t
15
+ - https://drive.usercontent.google.com/download?id=1stwK2hFsyaV7HHsEG9EYgnUQebNb2_nG&authuser=0&confirm=t
16
+ # COCOv1.4
17
+ - https://drive.usercontent.google.com/download?id=1Se2QSGS19xx7Gfy-SUdX9mlAOr2eYsfA&authuser=0&confirm=t
18
+ - https://drive.usercontent.google.com/download?id=1xvekFi389QfkH7yS0JIVV0QzjhUspjDv&authuser=0&confirm=t
19
+ # IC15_1811
20
+ - https://drive.usercontent.google.com/download?id=1pHsw8wrThD9EGEE6AusQLZozefSj4iyR&authuser=0&confirm=t
21
+ - https://drive.usercontent.google.com/download?id=1TXZ1qHuKAksaAlvd3qMv4IHKnN-IJW9a&authuser=0&confirm=t
22
+ # Uber
23
+ - https://drive.usercontent.google.com/download?id=1L2j6BZeLTGQ1FIl8HB_D3AFiWLltGV5r&authuser=0&confirm=t
24
+ - https://drive.usercontent.google.com/download?id=12DUj28yzLWxFO_gfMfSjTkRujYD5MNEE&authuser=0&confirm=t
25
+ # IC13_1095
26
+ - https://drive.usercontent.google.com/download?id=1fu8onMt3Z6fDLNAiHcm-sQ2qCXduE-FU&authuser=0&confirm=t
27
+ - https://drive.usercontent.google.com/download?id=1OQAZtLj8U2Cl4L0ErGFsz6vGIVTTWasD&authuser=0&confirm=t
28
+ # IC13_1015
29
+ - https://drive.usercontent.google.com/download?id=1mbsfuvWB282HYfn9tbqcj1nUDkLXcSNB&authuser=0&confirm=t
30
+ - https://drive.usercontent.google.com/download?id=1QGogU_hV-oN7iY2POutdD2LDcmK6plnV&authuser=0&confirm=t
31
+ # ArT
32
+ - https://drive.usercontent.google.com/download?id=1-53knSy-uTSngCG7wyBngVyTuTCmdnWl&authuser=0&confirm=t
33
+ - https://drive.usercontent.google.com/download?id=172EsSaf7BVaB1ORtohi-Jc_8SuUKZGGf&authuser=0&confirm=t
34
+ # SVT
35
+ - https://drive.usercontent.google.com/download?id=1p7aVUr9Yr7c4X4YUBvk2-YP28rraHjn9&authuser=0&confirm=t
36
+ - https://drive.usercontent.google.com/download?id=1ALmhvSleZ0yf-lcdbQPP3M9Zc3oqnXij&authuser=0&confirm=t
37
+ # CUTE80
38
+ - https://drive.usercontent.google.com/download?id=1Ujr4axHKnu54P2rIGUhkjdM6XlhDYrI_&authuser=0&confirm=t
39
+ - https://drive.usercontent.google.com/download?id=1DvZi9L3MqjO2zRUyCg3YvP4qMAt2bsme&authuser=0&confirm=t
40
+ filenames:
41
+ # IC13_857
42
+ - ../test/IC13_857/data.mdb
43
+ - ../test/IC13_857/lock.mdb
44
+ # IC15_2077
45
+ - ../test/IC15_2077/data.mdb
46
+ - ../test/IC15_2077/lock.mdb
47
+ # SVTP
48
+ - ../test/SVTP/data.mdb
49
+ - ../test/SVTP/lock.mdb
50
+ # IIIT5k
51
+ - ../test/IIIT5k/data.mdb
52
+ - ../test/IIIT5k/lock.mdb
53
+ # COCOv1.4
54
+ - ../test/COCOv1.4/data.mdb
55
+ - ../test/COCOv1.4/lock.mdb
56
+ # IC15_1811
57
+ - ../test/IC15_1811/data.mdb
58
+ - ../test/IC15_1811/lock.mdb
59
+ # Uber
60
+ - ../test/Uber/data.mdb
61
+ - ../test/Uber/lock.mdb
62
+ # IC13_1095
63
+ - ../test/IC13_1095/data.mdb
64
+ - ../test/IC13_1095/lock.mdb
65
+ # IC13_1015
66
+ - ../test/IC13_1015/data.mdb
67
+ - ../test/IC13_1015/lock.mdb
68
+ # ArT
69
+ - ../test/ArT/data.mdb
70
+ - ../test/ArT/lock.mdb
71
+ # SVT
72
+ - ../test/SVT/data.mdb
73
+ - ../test/SVT/lock.mdb
74
+ # CUTE80
75
+ - ../test/CUTE80/data.mdb
76
+ - ../test/CUTE80/lock.mdb
77
+ check_validity: true
configs/dataset/rec/textocr.yaml ADDED
@@ -0,0 +1,13 @@
1
+ root: ../TextOCR
2
+ task: str
3
+ download_links:
4
+ # train
5
+ - https://drive.usercontent.google.com/download?id=1jVjJFno4pnsU0Cp_kn4MIXQrChmELy92&authuser=0&confirm=
6
+ # val
7
+ - https://drive.usercontent.google.com/download?id=1ubIRu01MXIek6OvInu-XjaIbw6277-vw&authuser=0&confirm=t
8
+ filenames:
9
+ # train
10
+ - ../TextOCR/train.zip
11
+ # val
12
+ - ../TextOCR/val.zip
13
+ check_validity: true
configs/dataset/rec/textocr_horizontal.yaml ADDED
@@ -0,0 +1,13 @@
1
+ root: ../TextOCR_horizontal
2
+ task: str
3
+ download_links:
4
+ # train
5
+ - https://drive.usercontent.google.com/download?id=1sWH6J11xbjQb8SH7fdG_8mIKVI81ZQy5&authuser=0&confirm=
6
+ # val
7
+ - https://drive.usercontent.google.com/download?id=1gIE-AU2o-5hvg288-bjphO6UkI5AEQ2d&authuser=0&confirm=t
8
+ filenames:
9
+ # train
10
+ - ../TextOCR_horizontal/train.zip
11
+ # val
12
+ - ../TextOCR_horizontal/val.zip
13
+ check_validity: true
configs/dataset/rec/union14m_b.yaml ADDED
@@ -0,0 +1,47 @@
1
+ root: ../u14m
2
+ task: str
3
+ download_links:
4
+ # artistic
5
+ - https://drive.usercontent.google.com/download?id=1Je2DTuFHnkXDI99yDnm9Anl5naWaCQwd&authuser=0&confirm=t
6
+ - https://drive.usercontent.google.com/download?id=1xtT_Q0juBJUIvAG55qBxoVNNTECd2usZ&authuser=0&confirm=t
7
+ # contextless
8
+ - https://drive.usercontent.google.com/download?id=1_0OzyzWhZOmGrHkayFTVrzhrQrNRDRPR&authuser=0&confirm=t
9
+ - https://drive.usercontent.google.com/download?id=1PPgC42y3xoM9bR0HQFbDYbcT3PzMdD_y&authuser=0&confirm=t
10
+ # salient
11
+ - https://drive.usercontent.google.com/download?id=1tHLMYBmTqRnxvFOTT3dfLfQiundqFWfd&authuser=0&confirm=t
12
+ - https://drive.usercontent.google.com/download?id=13NQgpAtCK0kh9M5E2pAUmKKEp6Qu5Xwj&authuser=0&confirm=t
13
+ # multi_words
14
+ - https://drive.usercontent.google.com/download?id=1IlnDKX3V_Vp9gsDGFB0xoqsVLH1vtxUI&authuser=0&confirm=t
15
+ - https://drive.usercontent.google.com/download?id=1mFFjC7C0CwevvkwFU9YeVbZBdps_3Qpb&authuser=0&confirm=t
16
+ # curve
17
+ - https://drive.usercontent.google.com/download?id=1MxhMd85cmhUtI2lmtXhZQuFk7lav0_fw&authuser=0&confirm=t
18
+ - https://drive.usercontent.google.com/download?id=1N03g-4e-kJG2mRvlM0c5TrwWAkd-iG-Q&authuser=0&confirm=t
19
+ # general
20
+ - https://drive.usercontent.google.com/download?id=1Oqt7OaycP466NWoDmoJ3FqS8YP3YRgvu&authuser=0&confirm=t
21
+ - https://drive.usercontent.google.com/download?id=1K0MrX5eYNt8IIGFHXCwg0_oI5OF5PPFO&authuser=0&confirm=t
22
+ # multi_oriented
23
+ - https://drive.usercontent.google.com/download?id=1TKZFcZPVk0ThqfF-AGhJk_OCLg0ykKbv&authuser=0&confirm=t
24
+ - https://drive.usercontent.google.com/download?id=1PAoLMUWuR7O2-7XRoKkNzQcSiznErQzD&authuser=0&confirm=t
25
+ filenames:
26
+ # artistic
27
+ - ../u14m/artistic/data.mdb
28
+ - ../u14m/artistic/lock.mdb
29
+ # contextless
30
+ - ../u14m/contextless/data.mdb
31
+ - ../u14m/contextless/lock.mdb
32
+ # salient
33
+ - ../u14m/salient/data.mdb
34
+ - ../u14m/salient/lock.mdb
35
+ # multi_words
36
+ - ../u14m/multi_words/data.mdb
37
+ - ../u14m/multi_words/lock.mdb
38
+ # curve
39
+ - ../u14m/curve/data.mdb
40
+ - ../u14m/curve/lock.mdb
41
+ # general
42
+ - ../u14m/general/data.mdb
43
+ - ../u14m/general/lock.mdb
44
+ # multi_oriented
45
+ - ../u14m/multi_oriented/data.mdb
46
+ - ../u14m/multi_oriented/lock.mdb
47
+ check_validity: true
configs/dataset/rec/union14m_l_filtered.yaml ADDED
@@ -0,0 +1,35 @@
1
+ root: ../Union14M-L-LMDB-Filtered
2
+ task: str
3
+ download_links:
4
+ # train_challenging
5
+ - https://drive.usercontent.google.com/download?id=1etwzBgGHjsFsb0sygsaRnKbanW2PMe07&authuser=0&confirm=t
6
+ - https://drive.usercontent.google.com/download?id=1ly6FJfPjItwGlVQ-ifTrzzM3rVu3Ezhr&authuser=0&confirm=t
7
+ # train_easy
8
+ - https://drive.usercontent.google.com/download?id=1_zeNluTnywIaa5h3PN-Ah9tKyByypot7&authuser=0&confirm=t
9
+ - https://drive.usercontent.google.com/download?id=1caYLeQHDidXgVBDi9IWXbO1gg__DYq9a&authuser=0&confirm=t
10
+ # train_hard
11
+ - https://drive.usercontent.google.com/download?id=1eP6s2xyYPZX9gykvWA4VSOc3Fqul_UB_&authuser=0&confirm=t
12
+ - https://drive.usercontent.google.com/download?id=1-ZlCvocX8P5uVRclUXp_5DNGLDzd16EO&authuser=0&confirm=t
13
+ # train_medium
14
+ - https://drive.usercontent.google.com/download?id=1s_CoaLNJEr-UxHYiqZ5jOcliMCFiRUUy&authuser=0&confirm=t
15
+ - https://drive.usercontent.google.com/download?id=1Wpj6WVpZ5Ily77kVwfQ18CiZBzkgmEnF&authuser=0&confirm=t
16
+ # train_normal
17
+ - https://drive.usercontent.google.com/download?id=1jPt44arlAswl9cXZjzmVcdpptdTPpJ3I&authuser=0&confirm=t
18
+ - https://drive.usercontent.google.com/download?id=1Rfc5kE03AzOUv7B_eYcBhUV8KMQ2MZ1m&authuser=0&confirm=t
19
+ filenames:
20
+ # train_challenging
21
+ - ../Union14M-L-LMDB-Filtered/train_challenging/data.mdb
22
+ - ../Union14M-L-LMDB-Filtered/train_challenging/lock.mdb
23
+ # train_easy
24
+ - ../Union14M-L-LMDB-Filtered/train_easy/data.mdb
25
+ - ../Union14M-L-LMDB-Filtered/train_easy/lock.mdb
26
+ # train_hard
27
+ - ../Union14M-L-LMDB-Filtered/train_hard/data.mdb
28
+ - ../Union14M-L-LMDB-Filtered/train_hard/lock.mdb
29
+ # train_medium
30
+ - ../Union14M-L-LMDB-Filtered/train_medium/data.mdb
31
+ - ../Union14M-L-LMDB-Filtered/train_medium/lock.mdb
32
+ # train_normal
33
+ - ../Union14M-L-LMDB-Filtered/train_normal/data.mdb
34
+ - ../Union14M-L-LMDB-Filtered/train_normal/lock.mdb
35
+ check_validity: true
configs/det/dbnet/repvit_db.yml ADDED
@@ -0,0 +1,171 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: &epoch_num 500
4
+ log_smooth_window: 20
5
+ print_batch_step: 100
6
+ output_dir: ./output/det_repsvtr_db
7
+ save_epoch_step: [400, 25]
8
+ eval_batch_step:
9
+ - 0
10
+ - 1000
11
+ cal_metric_during_train: false
12
+ checkpoints:
13
+ pretrained_model: openocr_det_repvit_ch.pth
14
+ save_inference_dir: null
15
+ use_tensorboard: false
16
+ infer_img:
17
+ save_res_path: ./checkpoints/det_db/predicts_db.txt
18
+ distributed: true
19
+ model_type: det
20
+
21
+ Architecture:
22
+ algorithm: DB_mobile
23
+ Backbone:
24
+ name: RepSVTR_det
25
+ Neck:
26
+ name: RSEFPN
27
+ out_channels: 96
28
+ shortcut: True
29
+ Head:
30
+ name: DBHead
31
+ k: 50
32
+
33
+ Loss:
34
+ name: DBLoss
35
+ balance_loss: true
36
+ main_loss_type: DiceLoss
37
+ alpha: 5
38
+ beta: 10
39
+ ohem_ratio: 3
40
+
41
+ Optimizer:
42
+ name: Adam
43
+ lr: 0.001
44
+ weight_decay: 5.0e-05
45
+ filter_bias_and_bn: False
46
+
47
+ LRScheduler:
48
+ name: CosineAnnealingLR
49
+ warmup_epoch: 2
50
+
51
+ PostProcess:
52
+ name: DBPostProcess
53
+ thresh: 0.3
54
+ box_thresh: 0.6
55
+ max_candidates: 1000
56
+ unclip_ratio: 1.5
57
+ score_mode: 'slow'
58
+
59
+ Metric:
60
+ name: DetMetric
61
+ main_indicator: hmean
62
+
63
+ Train:
64
+ dataset:
65
+ name: SimpleDataSet
66
+ data_dir: ../icdar2015/text_localization/
67
+ label_file_list:
68
+ - ../icdar2015/text_localization/train_icdar2015_label.txt
69
+ ratio_list: [1.0]
70
+ transforms:
71
+ - DecodeImage:
72
+ img_mode: BGR
73
+ channel_first: false
74
+ - DetLabelEncode: null
75
+ - CopyPaste: null
76
+ - IaaAugment:
77
+ augmenter_args:
78
+ - type: Fliplr
79
+ args:
80
+ p: 0.5
81
+ - type: Affine
82
+ args:
83
+ rotate:
84
+ - -10
85
+ - 10
86
+ - type: Resize
87
+ args:
88
+ size:
89
+ - 0.5
90
+ - 3
91
+ - EastRandomCropData:
92
+ size:
93
+ - 640
94
+ - 640
95
+ max_tries: 50
96
+ keep_ratio: true
97
+ - MakeBorderMap:
98
+ shrink_ratio: 0.4
99
+ thresh_min: 0.3
100
+ thresh_max: 0.7
101
+ total_epoch: *epoch_num
102
+ - MakeShrinkMap:
103
+ shrink_ratio: 0.4
104
+ min_text_size: 8
105
+ total_epoch: *epoch_num
106
+ - NormalizeImage:
107
+ scale: 1./255.
108
+ mean:
109
+ - 0.485
110
+ - 0.456
111
+ - 0.406
112
+ std:
113
+ - 0.229
114
+ - 0.224
115
+ - 0.225
116
+ order: hwc
117
+ - ToCHWImage: null
118
+ - KeepKeys:
119
+ keep_keys:
120
+ - image
121
+ - threshold_map
122
+ - threshold_mask
123
+ - shrink_map
124
+ - shrink_mask
125
+ loader:
126
+ shuffle: true
127
+ drop_last: false
128
+ batch_size_per_card: 8
129
+ num_workers: 8
130
+
131
+ Eval:
132
+ dataset:
133
+ name: SimpleDataSet
134
+ data_dir: ../icdar2015/text_localization/
135
+ label_file_list:
136
+ - ../icdar2015/text_localization/test_icdar2015_label.txt
137
+ transforms:
138
+ - DecodeImage:
139
+ img_mode: BGR
140
+ channel_first: false
141
+ - DetLabelEncode: null
142
+ - DetResizeForTest:
143
+ # image_shape: [1280, 1280]
144
+ # keep_ratio: True
145
+ # padding: True
146
+ limit_side_len: 960
147
+ limit_type: max
148
+ - NormalizeImage:
149
+ scale: 1./255.
150
+ mean:
151
+ - 0.485
152
+ - 0.456
153
+ - 0.406
154
+ std:
155
+ - 0.229
156
+ - 0.224
157
+ - 0.225
158
+ order: hwc
159
+ - ToCHWImage: null
160
+ - KeepKeys:
161
+ keep_keys:
162
+ - image
163
+ - shape
164
+ - polys
165
+ - ignore_tags
166
+ loader:
167
+ shuffle: false
168
+ drop_last: false
169
+ batch_size_per_card: 1
170
+ num_workers: 2
171
+ profiler_options: null
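
Note: this config defines the epoch count once with a YAML anchor (&epoch_num) so that the MakeBorderMap/MakeShrinkMap transforms can reference it via *epoch_num. A small sketch of how that resolves when the file is parsed with PyYAML (an assumed loader; the repository's own config reader may add merging or overrides):

import yaml

with open('configs/det/dbnet/repvit_db.yml', 'r', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

print(cfg['Global']['epoch_num'])  # 500, defined once via &epoch_num
# Every "total_epoch: *epoch_num" alias in the Train transforms resolves to the same 500.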
configs/rec/abinet/resnet45_trans_abinet_lang.yml ADDED
@@ -0,0 +1,94 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet45_trans_abinet_lang/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ # ./openocr_nolang_abinet_lang.pth
12
+ checkpoints:
13
+ use_tensorboard: false
14
+ infer_img:
15
+ # for data or label process
16
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
17
+ max_text_length: 25
18
+ use_space_char: False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_abinet_lang.txt
20
+ grad_clip_val: 20
21
+ use_amp: True
22
+
23
+ Optimizer:
24
+ name: Adam
25
+ lr: 0.000267
26
+ weight_decay: 0.0
27
+ filter_bias_and_bn: False
28
+
29
+ LRScheduler:
30
+ name: MultiStepLR
31
+ milestones: [12]
32
+ gamma: 0.1
33
+
34
+ Architecture:
35
+ model_type: rec
36
+ algorithm: ABINet
37
+ Transform:
38
+ Encoder:
39
+ name: ResNet45
40
+ in_channels: 3
41
+ strides: [2, 1, 2, 1, 1]
42
+ Decoder:
43
+ name: ABINetDecoder
44
+ iter_size: 3
45
+
46
+ Loss:
47
+ name: ABINetLoss
48
+
49
+ PostProcess:
50
+ name: ABINetLabelDecode
51
+
52
+ Metric:
53
+ name: RecMetric
54
+ main_indicator: acc
55
+ is_filter: True
56
+
57
+ Train:
58
+ dataset:
59
+ name: LMDBDataSet
60
+ data_dir: ../Union14M-L-LMDB-Filtered
61
+ transforms:
62
+ - DecodeImagePIL: # load image
63
+ img_mode: RGB
64
+ - PARSeqAugPIL:
65
+ - ABINetLabelEncode:
66
+ - RecTVResize:
67
+ image_shape: [32, 128]
68
+ padding: False
69
+ - KeepKeys:
70
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
71
+ loader:
72
+ shuffle: True
73
+ batch_size_per_card: 256
74
+ drop_last: True
75
+ num_workers: 4
76
+
77
+ Eval:
78
+ dataset:
79
+ name: LMDBDataSet
80
+ data_dir: ../evaluation
81
+ transforms:
82
+ - DecodeImagePIL: # load image
83
+ img_mode: RGB
84
+ - ABINetLabelEncode:
85
+ - RecTVResize:
86
+ image_shape: [32, 128]
87
+ padding: False
88
+ - KeepKeys:
89
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
90
+ loader:
91
+ shuffle: False
92
+ drop_last: False
93
+ batch_size_per_card: 256
94
+ num_workers: 2
configs/rec/abinet/resnet45_trans_abinet_wo_lang.yml ADDED
@@ -0,0 +1,93 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet45_trans_abinet_wo_lang/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_abinet_wo_lang.txt
19
+ grad_clip_val: 20
20
+ use_amp: True
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.000267
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: MultiStepLR
30
+ milestones: [12]
31
+ gamma: 0.1
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: ABINet
36
+ Transform:
37
+ Encoder:
38
+ name: ResNet45
39
+ in_channels: 3
40
+ strides: [2, 1, 2, 1, 1]
41
+ Decoder:
42
+ name: ABINetDecoder
43
+ iter_size: 0
44
+
45
+ Loss:
46
+ name: ABINetLoss
47
+
48
+ PostProcess:
49
+ name: ABINetLabelDecode
50
+
51
+ Metric:
52
+ name: RecMetric
53
+ main_indicator: acc
54
+ is_filter: True
55
+
56
+ Train:
57
+ dataset:
58
+ name: LMDBDataSet
59
+ data_dir: ../Union14M-L-LMDB-Filtered
60
+ transforms:
61
+ - DecodeImagePIL: # load image
62
+ img_mode: RGB
63
+ - PARSeqAugPIL:
64
+ - ABINetLabelEncode:
65
+ - RecTVResize:
66
+ image_shape: [32, 128]
67
+ padding: False
68
+ - KeepKeys:
69
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
70
+ loader:
71
+ shuffle: True
72
+ batch_size_per_card: 256
73
+ drop_last: True
74
+ num_workers: 4
75
+
76
+ Eval:
77
+ dataset:
78
+ name: LMDBDataSet
79
+ data_dir: ../evaluation
80
+ transforms:
81
+ - DecodeImagePIL: # load image
82
+ img_mode: RGB
83
+ - ABINetLabelEncode:
84
+ - RecTVResize:
85
+ image_shape: [32, 128]
86
+ padding: False
87
+ - KeepKeys:
88
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
89
+ loader:
90
+ shuffle: False
91
+ drop_last: False
92
+ batch_size_per_card: 256
93
+ num_workers: 2
configs/rec/abinet/svtrv2_abinet_lang.yml ADDED
@@ -0,0 +1,130 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_abinet_lang/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ # ./openocr_svtrv2_nolang_abinet_lang.pth
12
+ checkpoints:
13
+ use_tensorboard: false
14
+ infer_img:
15
+ # for data or label process
16
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
17
+ max_text_length: 25
18
+ use_space_char: False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_abinet_lang.txt
20
+ use_amp: True
21
+ grad_clip_val: 20
22
+
23
+ Optimizer:
24
+ name: AdamW
25
+ lr: 0.00065 # for 4gpus bs256/gpu
26
+ weight_decay: 0.05
27
+ filter_bias_and_bn: True
28
+
29
+ LRScheduler:
30
+ name: OneCycleLR
31
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
32
+ cycle_momentum: False
33
+
34
+ Architecture:
35
+ model_type: rec
36
+ algorithm: ABINet
37
+ Transform:
38
+ Encoder:
39
+ name: SVTRv2LNConvTwo33
40
+ use_pos_embed: False
41
+ dims: [128, 256, 384]
42
+ depths: [6, 6, 6]
43
+ num_heads: [4, 8, 12]
44
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
45
+ local_k: [[5, 5], [5, 5], [-1, -1]]
46
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
47
+ last_stage: false
48
+ feat2d: True
49
+ Decoder:
50
+ name: ABINetDecoder
51
+ iter_size: 3
52
+ num_layers: 0
53
+
54
+ Loss:
55
+ name: ABINetLoss
56
+
57
+ PostProcess:
58
+ name: ABINetLabelDecode
59
+
60
+ Metric:
61
+ name: RecMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: RatioDataSetTVResize
68
+ ds_width: True
69
+ padding: false
70
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
71
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
72
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
73
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
75
+ ]
76
+ transforms:
77
+ - DecodeImagePIL: # load image
78
+ img_mode: RGB
79
+ - PARSeqAugPIL:
80
+ - ABINetLabelEncode:
81
+ - KeepKeys:
82
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
83
+ sampler:
84
+ name: RatioSampler
85
+ scales: [[128, 32]] # w, h
86
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
87
+ first_bs: &bs 256
88
+ fix_bs: false
89
+ divided_factor: [4, 16] # w, h
90
+ is_training: True
91
+ loader:
92
+ shuffle: True
93
+ batch_size_per_card: *bs
94
+ drop_last: True
95
+ max_ratio: &max_ratio 4
96
+ num_workers: 4
97
+
98
+ Eval:
99
+ dataset:
100
+ name: RatioDataSetTVResize
101
+ ds_width: True
102
+ padding: False
103
+ data_dir_list: [
104
+ '../evaluation/CUTE80',
105
+ '../evaluation/IC13_857',
106
+ '../evaluation/IC15_1811',
107
+ '../evaluation/IIIT5k',
108
+ '../evaluation/SVT',
109
+ '../evaluation/SVTP',
110
+ ]
111
+ transforms:
112
+ - DecodeImagePIL: # load image
113
+ img_mode: RGB
114
+ - ABINetLabelEncode:
115
+ - KeepKeys:
116
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
117
+ sampler:
118
+ name: RatioSampler
119
+ scales: [[128, 32]] # w, h
120
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
121
+ first_bs: *bs
122
+ fix_bs: false
123
+ divided_factor: [4, 16] # w, h
124
+ is_training: False
125
+ loader:
126
+ shuffle: False
127
+ drop_last: False
128
+ batch_size_per_card: *bs
129
+ max_ratio: *max_ratio
130
+ num_workers: 4
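
Note: the OneCycleLR comment above ("pct_start 0.075*20 = 1.5ep") maps warmup_epoch to the scheduler's pct_start fraction. A PyTorch sketch of that relationship (the model, optimizer, and step counts below are illustrative, not taken from the repository):

import torch

model = torch.nn.Linear(8, 8)  # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00065, weight_decay=0.05)

epochs, steps_per_epoch = 20, 1000  # illustrative step count
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.00065,
    total_steps=epochs * steps_per_epoch,
    pct_start=1.5 / epochs,   # warmup_epoch 1.5 of 20 epochs = 0.075
    cycle_momentum=False,     # matches cycle_momentum: False above
)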
configs/rec/abinet/svtrv2_abinet_wo_lang.yml ADDED
@@ -0,0 +1,128 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_abinet_wo_lang/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_abinet_wo_lang.txt
19
+ use_amp: True
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065 # for 4gpus bs256/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: ABINet
36
+ Transform:
37
+ Encoder:
38
+ name: SVTRv2LNConvTwo33
39
+ use_pos_embed: False
40
+ dims: [128, 256, 384]
41
+ depths: [6, 6, 6]
42
+ num_heads: [4, 8, 12]
43
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
44
+ local_k: [[5, 5], [5, 5], [-1, -1]]
45
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
46
+ last_stage: false
47
+ feat2d: True
48
+ Decoder:
49
+ name: ABINetDecoder
50
+ iter_size: 0
51
+ num_layers: 0
52
+ Loss:
53
+ name: ABINetLoss
54
+
55
+ PostProcess:
56
+ name: ABINetLabelDecode
57
+
58
+ Metric:
59
+ name: RecMetric
60
+ main_indicator: acc
61
+ is_filter: True
62
+
63
+ Train:
64
+ dataset:
65
+ name: RatioDataSetTVResize
66
+ ds_width: True
67
+ padding: false
68
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
69
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
70
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
71
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
72
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
73
+ ]
74
+ transforms:
75
+ - DecodeImagePIL: # load image
76
+ img_mode: RGB
77
+ - PARSeqAugPIL:
78
+ - ABINetLabelEncode:
79
+ - KeepKeys:
80
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
81
+ sampler:
82
+ name: RatioSampler
83
+ scales: [[128, 32]] # w, h
84
+ # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
85
+ first_bs: &bs 256
86
+ fix_bs: false
87
+ divided_factor: [4, 16] # w, h
88
+ is_training: True
89
+ loader:
90
+ shuffle: True
91
+ batch_size_per_card: *bs
92
+ drop_last: True
93
+ max_ratio: &max_ratio 4
94
+ num_workers: 4
95
+
96
+ Eval:
97
+ dataset:
98
+ name: RatioDataSetTVResize
99
+ ds_width: True
100
+ padding: False
101
+ data_dir_list: [
102
+ '../evaluation/CUTE80',
103
+ '../evaluation/IC13_857',
104
+ '../evaluation/IC15_1811',
105
+ '../evaluation/IIIT5k',
106
+ '../evaluation/SVT',
107
+ '../evaluation/SVTP',
108
+ ]
109
+ transforms:
110
+ - DecodeImagePIL: # load image
111
+ img_mode: RGB
112
+ - ABINetLabelEncode:
113
+ - KeepKeys:
114
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
115
+ sampler:
116
+ name: RatioSampler
117
+ scales: [[128, 32]] # w, h
118
+ # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
119
+ first_bs: *bs
120
+ fix_bs: false
121
+ divided_factor: [4, 16] # w, h
122
+ is_training: False
123
+ loader:
124
+ shuffle: False
125
+ drop_last: False
126
+ batch_size_per_card: *bs
127
+ max_ratio: *max_ratio
128
+ num_workers: 4
configs/rec/aster/resnet31_lstm_aster_tps_on.yml ADDED
@@ -0,0 +1,93 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet31_lstm_aster_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/predicts_aster_tps.txt
19
+ use_amp: True
20
+ grad_clip_val: 1.0
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.002 # for 1gpus bs1024/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: aster
36
+ Transform:
37
+ name: Aster_TPS
38
+ tps_inputsize: [32, 64]
39
+ tps_outputsize: [32, 128]
40
+ Encoder:
41
+ name: ResNet_ASTER
42
+ Decoder:
43
+ name: ASTERDecoder
44
+
45
+ Loss:
46
+ name: ARLoss
47
+
48
+ Metric:
49
+ name: RecMetric
50
+ main_indicator: acc
51
+ is_filter: True
52
+
53
+ PostProcess:
54
+ name: ARLabelDecode
55
+
56
+ Train:
57
+ dataset:
58
+ name: LMDBDataSet
59
+ data_dir: ../Union14M-L-LMDB-Filtered
60
+ transforms:
61
+ - DecodeImagePIL: # load image
62
+ img_mode: RGB
63
+ - PARSeqAugPIL:
64
+ - ARLabelEncode: # Class handling label
65
+ - RecTVResize:
66
+ image_shape: [64, 256]
67
+ padding: False
68
+ - KeepKeys:
69
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
70
+ loader:
71
+ shuffle: True
72
+ batch_size_per_card: 1024
73
+ drop_last: True
74
+ num_workers: 4
75
+
76
+ Eval:
77
+ dataset:
78
+ name: LMDBDataSet
79
+ data_dir: ../evaluation
80
+ transforms:
81
+ - DecodeImagePIL: # load image
82
+ img_mode: RGB
83
+ - ARLabelEncode: # Class handling label
84
+ - RecTVResize:
85
+ image_shape: [64, 256]
86
+ padding: False
87
+ - KeepKeys:
88
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
89
+ loader:
90
+ shuffle: False
91
+ drop_last: False
92
+ batch_size_per_card: 256
93
+ num_workers: 2
configs/rec/aster/svtrv2_aster.yml ADDED
@@ -0,0 +1,127 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_aster
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_aster.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.00065 # for 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+
27
+ LRScheduler:
28
+ name: OneCycleLR
29
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
30
+ cycle_momentum: False
31
+
32
+ Architecture:
33
+ model_type: rec
34
+ algorithm: aster
35
+ Transform:
36
+ Encoder:
37
+ name: SVTRv2LNConvTwo33
38
+ use_pos_embed: False
39
+ out_channels: 256
40
+ dims: [128, 256, 384]
41
+ depths: [6, 6, 6]
42
+ num_heads: [4, 8, 12]
43
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
44
+ local_k: [[5, 5], [5, 5], [-1, -1]]
45
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
46
+ last_stage: false
47
+ feat2d: False
48
+ Decoder:
49
+ name: ASTERDecoder
50
+
51
+ Loss:
52
+ name: ARLoss
53
+
54
+ Metric:
55
+ name: RecMetric
56
+ main_indicator: acc
57
+ is_filter: True
58
+
59
+ PostProcess:
60
+ name: ARLabelDecode
61
+
62
+ Train:
63
+ dataset:
64
+ name: RatioDataSetTVResize
65
+ ds_width: True
66
+ padding: false
67
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
68
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
69
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
70
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
71
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
72
+ ]
73
+ transforms:
74
+ - DecodeImagePIL: # load image
75
+ img_mode: RGB
76
+ - PARSeqAugPIL:
77
+ - ARLabelEncode: # Class handling label
78
+ - KeepKeys:
79
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
80
+ sampler:
81
+ name: RatioSampler
82
+ scales: [[128, 32]] # w, h
83
+ # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
84
+ first_bs: &bs 256
85
+ fix_bs: false
86
+ divided_factor: [4, 16] # w, h
87
+ is_training: True
88
+ loader:
89
+ shuffle: True
90
+ batch_size_per_card: *bs
91
+ drop_last: True
92
+ max_ratio: &max_ratio 4
93
+ num_workers: 4
94
+
95
+ Eval:
96
+ dataset:
97
+ name: RatioDataSetTVResize
98
+ ds_width: True
99
+ padding: False
100
+ data_dir_list: [
101
+ '../evaluation/CUTE80',
102
+ '../evaluation/IC13_857',
103
+ '../evaluation/IC15_1811',
104
+ '../evaluation/IIIT5k',
105
+ '../evaluation/SVT',
106
+ '../evaluation/SVTP',
107
+ ]
108
+ transforms:
109
+ - DecodeImagePIL: # load image
110
+ img_mode: RGB
111
+ - ARLabelEncode: # Class handling label
112
+ - KeepKeys:
113
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
114
+ sampler:
115
+ name: RatioSampler
116
+ scales: [[128, 32]] # w, h
117
+ # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
118
+ first_bs: *bs
119
+ fix_bs: false
120
+ divided_factor: [4, 16] # w, h
121
+ is_training: False
122
+ loader:
123
+ shuffle: False
124
+ drop_last: False
125
+ batch_size_per_card: *bs
126
+ max_ratio: *max_ratio
127
+ num_workers: 4
configs/rec/aster/svtrv2_aster_tps_on.yml ADDED
@@ -0,0 +1,102 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_aster_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_aster_tps_on.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.00065 # for 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+
27
+ LRScheduler:
28
+ name: OneCycleLR
29
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
30
+ cycle_momentum: False
31
+
32
+ Architecture:
33
+ model_type: rec
34
+ algorithm: aster
35
+ Transform:
36
+ name: Aster_TPS
37
+ tps_inputsize: [32, 64]
38
+ tps_outputsize: [32, 128]
39
+ Encoder:
40
+ name: SVTRv2LNConvTwo33
41
+ use_pos_embed: False
42
+ out_channels: 256
43
+ dims: [128, 256, 384]
44
+ depths: [6, 6, 6]
45
+ num_heads: [4, 8, 12]
46
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
47
+ local_k: [[5, 5], [5, 5], [-1, -1]]
48
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
49
+ last_stage: false
50
+ feat2d: False
51
+ Decoder:
52
+ name: ASTERDecoder
53
+
54
+ Loss:
55
+ name: ARLoss
56
+
57
+ Metric:
58
+ name: RecMetric
59
+ main_indicator: acc
60
+ is_filter: True
61
+
62
+ PostProcess:
63
+ name: ARLabelDecode
64
+
65
+ Train:
66
+ dataset:
67
+ name: LMDBDataSet
68
+ data_dir: ../Union14M-L-LMDB-Filtered
69
+ transforms:
70
+ - DecodeImagePIL: # load image
71
+ img_mode: RGB
72
+ - PARSeqAugPIL:
73
+ - ARLabelEncode: # Class handling label
74
+ - RecTVResize:
75
+ image_shape: [64, 256]
76
+ padding: False
77
+ - KeepKeys:
78
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
79
+ loader:
80
+ shuffle: True
81
+ batch_size_per_card: 256
82
+ drop_last: True
83
+ num_workers: 4
84
+
85
+ Eval:
86
+ dataset:
87
+ name: LMDBDataSet
88
+ data_dir: ../evaluation
89
+ transforms:
90
+ - DecodeImagePIL: # load image
91
+ img_mode: RGB
92
+ - ARLabelEncode: # Class handling label
93
+ - RecTVResize:
94
+ image_shape: [64, 256]
95
+ padding: False
96
+ - KeepKeys:
97
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
98
+ loader:
99
+ shuffle: False
100
+ drop_last: False
101
+ batch_size_per_card: 256
102
+ num_workers: 2
configs/rec/autostr/autostr_lstm_aster_tps_on.yml ADDED
@@ -0,0 +1,95 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/autostr_lstm_aster_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_autostr_lstm_aster_tps_on.txt
19
+ use_amp: True
20
+ grad_clip_val: 1.0
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.002 # for 4gpus bs256/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: autostr
36
+ Transform:
37
+ name: Aster_TPS
38
+ tps_inputsize: [32, 64]
39
+ tps_outputsize: [32, 128]
40
+ Encoder:
41
+ name: AutoSTREncoder
42
+ stride_stages: '[(2, 2), (2, 1), (2, 2), (2, 1), (2, 1)]'
43
+ conv_op_ids: [2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 4, 1, 1, 6, 6]
44
+ Decoder:
45
+ name: ASTERDecoder
46
+
47
+ Loss:
48
+ name: ARLoss
49
+
50
+ Metric:
51
+ name: RecMetric
52
+ main_indicator: acc
53
+ is_filter: True
54
+
55
+ PostProcess:
56
+ name: ARLabelDecode
57
+
58
+ Train:
59
+ dataset:
60
+ name: LMDBDataSet
61
+ data_dir: ../Union14M-L-LMDB-Filtered
62
+ transforms:
63
+ - DecodeImagePIL: # load image
64
+ img_mode: RGB
65
+ - PARSeqAugPIL:
66
+ - ARLabelEncode: # Class handling label
67
+ - RecTVResize:
68
+ image_shape: [64, 256]
69
+ padding: False
70
+ - KeepKeys:
71
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
72
+ loader:
73
+ shuffle: True
74
+ batch_size_per_card: 256
75
+ drop_last: True
76
+ num_workers: 4
77
+
78
+ Eval:
79
+ dataset:
80
+ name: LMDBDataSet
81
+ data_dir: ../evaluation
82
+ transforms:
83
+ - DecodeImagePIL: # load image
84
+ img_mode: RGB
85
+ - ARLabelEncode: # Class handling label
86
+ - RecTVResize:
87
+ image_shape: [64, 256]
88
+ padding: False
89
+ - KeepKeys:
90
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
91
+ loader:
92
+ shuffle: False
93
+ drop_last: False
94
+ batch_size_per_card: 256
95
+ num_workers: 2
configs/rec/busnet/svtrv2_busnet.yml ADDED
@@ -0,0 +1,135 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 10
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_busnet/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ # ./output/rec/u14m_filter/svtrv2_busnet_pretraining/best.pth
12
+ checkpoints:
13
+ use_tensorboard: false
14
+ infer_img:
15
+ # for data or label process
16
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
17
+ max_text_length: 25
18
+ use_space_char: False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_busnet.txt
20
+ use_amp: True
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065 # 4gpus bs256/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1 # pct_start 0.1*10 = 1ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: BUSBet
36
+ Transform:
37
+ Encoder:
38
+ name: SVTRv2LNConvTwo33
39
+ use_pos_embed: False
40
+ dims: [128, 256, 384]
41
+ depths: [6, 6, 6]
42
+ num_heads: [4, 8, 12]
43
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
44
+ local_k: [[5, 5], [5, 5], [-1, -1]]
45
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
46
+ last_stage: false
47
+ feat2d: False
48
+ Decoder:
49
+ name: BUSDecoder
50
+ nhead: 6
51
+ num_layers: 6
52
+ dim_feedforward: 1536
53
+ ignore_index: &ignore_index 100
54
+ pretraining: False
55
+ # return_id: 2
56
+ Loss:
57
+ name: ABINetLoss
58
+ ignore_index: *ignore_index
59
+
60
+ PostProcess:
61
+ name: ABINetLabelDecode
62
+
63
+ Metric:
64
+ name: RecMetric
65
+ main_indicator: acc
66
+ is_filter: True
67
+
68
+ Train:
69
+ dataset:
70
+ name: RatioDataSetTVResize
71
+ ds_width: True
72
+ padding: false
73
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
75
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
76
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
77
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
78
+ ]
79
+ transforms:
80
+ - DecodeImagePIL: # load image
81
+ img_mode: RGB
82
+ - PARSeqAugPIL:
83
+ - ABINetLabelEncode:
84
+ ignore_index: *ignore_index
85
+ - KeepKeys:
86
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
87
+ sampler:
88
+ name: RatioSampler
89
+ scales: [[128, 32]] # w, h
90
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
91
+ first_bs: &bs 256
92
+ fix_bs: false
93
+ divided_factor: [4, 16] # w, h
94
+ is_training: True
95
+ loader:
96
+ shuffle: True
97
+ batch_size_per_card: *bs
98
+ drop_last: True
99
+ max_ratio: &max_ratio 4
100
+ num_workers: 4
101
+
102
+ Eval:
103
+ dataset:
104
+ name: RatioDataSetTVResize
105
+ ds_width: True
106
+ padding: False
107
+ data_dir_list: [
108
+ '../evaluation/CUTE80',
109
+ '../evaluation/IC13_857',
110
+ '../evaluation/IC15_1811',
111
+ '../evaluation/IIIT5k',
112
+ '../evaluation/SVT',
113
+ '../evaluation/SVTP',
114
+ ]
115
+ transforms:
116
+ - DecodeImagePIL: # load image
117
+ img_mode: RGB
118
+ - ABINetLabelEncode:
119
+ ignore_index: *ignore_index
120
+ - KeepKeys:
121
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
122
+ sampler:
123
+ name: RatioSampler
124
+ scales: [[128, 32]] # w, h
125
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
126
+ first_bs: *bs
127
+ fix_bs: false
128
+ divided_factor: [4, 16] # w, h
129
+ is_training: False
130
+ loader:
131
+ shuffle: False
132
+ drop_last: False
133
+ batch_size_per_card: *bs
134
+ max_ratio: *max_ratio
135
+ num_workers: 4
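
The `RatioDataSetTVResize` + `RatioSampler` blocks above batch samples of similar aspect ratio together (capped at `max_ratio`) so each batch shares one target width at height 32. A rough, hypothetical sketch of that bucketing idea follows; it is not the repository's `RatioSampler`, and the helper name and details are assumptions for illustration only.

```python
# Hypothetical ratio-bucketing sketch: group indices whose aspect ratios fall
# into the same integer bucket, capped at max_ratio (cf. max_ratio: 4 above).
import random
from collections import defaultdict

def ratio_batches(sizes, batch_size=256, max_ratio=4, shuffle=True):
    buckets = defaultdict(list)
    for idx, (w, h) in enumerate(sizes):
        buckets[min(max(round(w / h), 1), max_ratio)].append(idx)
    batches = []
    for idxs in buckets.values():
        if shuffle:
            random.shuffle(idxs)
        batches += [idxs[i:i + batch_size] for i in range(0, len(idxs), batch_size)]
    if shuffle:
        random.shuffle(batches)
    return batches

# e.g. ratio_batches([(100, 32), (400, 32), (130, 32)], batch_size=2)
# puts index 0 in bucket 3 and indices 1, 2 together in bucket 4.
```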
configs/rec/busnet/svtrv2_busnet_pretraining.yml ADDED
@@ -0,0 +1,134 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 10
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_busnet_pretraining/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_busnet_pretraining.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.00065 # 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+
27
+ LRScheduler:
28
+ name: OneCycleLR
29
+ warmup_epoch: 1 # pct_start 0.1*10 = 1ep
30
+ cycle_momentum: False
31
+
32
+ Architecture:
33
+ model_type: rec
34
+ algorithm: BUSBet
35
+ Transform:
36
+ Encoder:
37
+ name: SVTRv2LNConvTwo33
38
+ use_pos_embed: False
39
+ dims: [128, 256, 384]
40
+ depths: [6, 6, 6]
41
+ num_heads: [4, 8, 12]
42
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
43
+ local_k: [[5, 5], [5, 5], [-1, -1]]
44
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
45
+ last_stage: false
46
+ feat2d: False
47
+ Decoder:
48
+ name: BUSDecoder
49
+ nhead: 6
50
+ num_layers: 6
51
+ dim_feedforward: 1536
52
+ ignore_index: &ignore_index 100
53
+ pretraining: True
54
+ # return_id: 0
55
+ Loss:
56
+ name: ABINetLoss
57
+ ignore_index: *ignore_index
58
+
59
+ PostProcess:
60
+ name: ABINetLabelDecode
61
+
62
+ Metric:
63
+ name: RecMetric
64
+ main_indicator: acc
65
+ is_filter: True
66
+
67
+ Train:
68
+ dataset:
69
+ name: RatioDataSetTVResize
70
+ ds_width: True
71
+ padding: false
72
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
73
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
75
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
76
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
77
+ ]
78
+ transforms:
79
+ - DecodeImagePIL: # load image
80
+ img_mode: RGB
81
+ - PARSeqAugPIL:
82
+ - ABINetLabelEncode:
83
+ ignore_index: *ignore_index
84
+ - KeepKeys:
85
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
86
+ sampler:
87
+ name: RatioSampler
88
+ scales: [[128, 32]] # w, h
89
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
90
+ first_bs: &bs 256
91
+ fix_bs: false
92
+ divided_factor: [4, 16] # w, h
93
+ is_training: True
94
+ loader:
95
+ shuffle: True
96
+ batch_size_per_card: *bs
97
+ drop_last: True
98
+ max_ratio: &max_ratio 4
99
+ num_workers: 4
100
+
101
+ Eval:
102
+ dataset:
103
+ name: RatioDataSetTVResize
104
+ ds_width: True
105
+ padding: False
106
+ data_dir_list: [
107
+ '../evaluation/CUTE80',
108
+ '../evaluation/IC13_857',
109
+ '../evaluation/IC15_1811',
110
+ '../evaluation/IIIT5k',
111
+ '../evaluation/SVT',
112
+ '../evaluation/SVTP',
113
+ ]
114
+ transforms:
115
+ - DecodeImagePIL: # load image
116
+ img_mode: RGB
117
+ - ABINetLabelEncode:
118
+ ignore_index: *ignore_index
119
+ - KeepKeys:
120
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
121
+ sampler:
122
+ name: RatioSampler
123
+ scales: [[128, 32]] # w, h
124
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
125
+ first_bs: *bs
126
+ fix_bs: false
127
+ divided_factor: [4, 16] # w, h
128
+ is_training: False
129
+ loader:
130
+ shuffle: False
131
+ drop_last: False
132
+ batch_size_per_card: *bs
133
+ max_ratio: *max_ratio
134
+ num_workers: 4
configs/rec/busnet/vit_busnet.yml ADDED
@@ -0,0 +1,104 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 10
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/vit_busnet/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_vit_busnet.txt
19
+ grad_clip_val: 20
20
+ use_amp: True
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.00053 # 4gpus bs256/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: MultiStepLR
30
+ milestones: [6]
31
+ gamma: 0.1
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: BUSBet
36
+ Transform:
37
+ Encoder:
38
+ name: ViT
39
+ img_size: [32,128]
40
+ patch_size: [4, 8]
41
+ embed_dim: 384
42
+ depth: 12
43
+ num_heads: 6
44
+ mlp_ratio: 4
45
+ qkv_bias: True
46
+ Decoder:
47
+ name: BUSDecoder
48
+ nhead: 6
49
+ num_layers: 6
50
+ dim_feedforward: 1536
51
+ ignore_index: &ignore_index 100
52
+ pretraining: False
53
+ Loss:
54
+ name: ABINetLoss
55
+ ignore_index: *ignore_index
56
+
57
+ PostProcess:
58
+ name: ABINetLabelDecode
59
+
60
+ Metric:
61
+ name: RecMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: LMDBDataSet
68
+ data_dir: ../Union14M-L-LMDB-Filtered
69
+ transforms:
70
+ - DecodeImagePIL: # load image
71
+ img_mode: RGB
72
+ - PARSeqAugPIL:
73
+ - ABINetLabelEncode:
74
+ ignore_index: *ignore_index
75
+ - RecTVResize:
76
+ image_shape: [32, 128]
77
+ padding: False
78
+ - KeepKeys:
79
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
80
+ loader:
81
+ shuffle: True
82
+ batch_size_per_card: 256
83
+ drop_last: True
84
+ num_workers: 4
85
+
86
+ Eval:
87
+ dataset:
88
+ name: LMDBDataSet
89
+ data_dir: ../evaluation
90
+ transforms:
91
+ - DecodeImagePIL: # load image
92
+ img_mode: RGB
93
+ - ABINetLabelEncode:
94
+ ignore_index: *ignore_index
95
+ - RecTVResize:
96
+ image_shape: [32, 128]
97
+ padding: False
98
+ - KeepKeys:
99
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
100
+ loader:
101
+ shuffle: False
102
+ drop_last: False
103
+ batch_size_per_card: 256
104
+ num_workers: 2
configs/rec/busnet/vit_busnet_pretraining.yml ADDED
@@ -0,0 +1,104 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 10
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/vit_busnet_pretraining/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_vit_busnet_pretraining.txt
19
+ grad_clip_val: 20
20
+ use_amp: True
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.00053 # 4gpus bs256/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: MultiStepLR
30
+ milestones: [6]
31
+ gamma: 0.1
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: BUSBet
36
+ Transform:
37
+ Encoder:
38
+ name: ViT
39
+ img_size: [32,128]
40
+ patch_size: [4, 8]
41
+ embed_dim: 384
42
+ depth: 12
43
+ num_heads: 6
44
+ mlp_ratio: 4
45
+ qkv_bias: True
46
+ Decoder:
47
+ name: BUSDecoder
48
+ nhead: 6
49
+ num_layers: 6
50
+ dim_feedforward: 1536
51
+ ignore_index: &ignore_index 100
52
+ pretraining: True
53
+ Loss:
54
+ name: ABINetLoss
55
+ ignore_index: *ignore_index
56
+
57
+ PostProcess:
58
+ name: ABINetLabelDecode
59
+
60
+ Metric:
61
+ name: RecMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: LMDBDataSet
68
+ data_dir: ../Union14M-L-LMDB-Filtered
69
+ transforms:
70
+ - DecodeImagePIL: # load image
71
+ img_mode: RGB
72
+ - PARSeqAugPIL:
73
+ - ABINetLabelEncode:
74
+ ignore_index: *ignore_index
75
+ - RecTVResize:
76
+ image_shape: [32, 128]
77
+ padding: False
78
+ - KeepKeys:
79
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
80
+ loader:
81
+ shuffle: True
82
+ batch_size_per_card: 256
83
+ drop_last: True
84
+ num_workers: 4
85
+
86
+ Eval:
87
+ dataset:
88
+ name: LMDBDataSet
89
+ data_dir: ../evaluation
90
+ transforms:
91
+ - DecodeImagePIL: # load image
92
+ img_mode: RGB
93
+ - ABINetLabelEncode:
94
+ ignore_index: *ignore_index
95
+ - RecTVResize:
96
+ image_shape: [32, 128]
97
+ padding: False
98
+ - KeepKeys:
99
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
100
+ loader:
101
+ shuffle: False
102
+ drop_last: False
103
+ batch_size_per_card: 256
104
+ num_workers: 2
configs/rec/cam/convnextv2_cam_tps_on.yml ADDED
@@ -0,0 +1,118 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/convnextv2_cam_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: False
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_convnextv2_cam_tps_on.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.0008 # for 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+ eps: 1.e-8
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: CAM
36
+ Transform:
37
+ name: Aster_TPS
38
+ tps_inputsize: [32, 64]
39
+ tps_outputsize: &img_shape [32, 128]
40
+ Encoder:
41
+ name: CAMEncoder
42
+ encoder_config:
43
+ name: ConvNeXtV2
44
+ depths: [2, 2, 8, 2]
45
+ dims: [80, 160, 320, 640]
46
+ strides: [[4,4], [2,1], [2,1], [1,1]]
47
+ drop_path_rate: 0.2
48
+ feat2d: True
49
+ nb_classes: 97
50
+ strides: [[4,4], [2,1], [2,1], [1,1]]
51
+ deform_stride: 2
52
+ stage_idx: 2
53
+ use_depthwise_unet: True
54
+ use_more_unet: False
55
+ binary_loss_type: BanlanceMultiClassCrossEntropyLoss
56
+ mid_size: True
57
+ d_embedding: 384
58
+ Decoder:
59
+ name: CAMDecoder
60
+ num_encoder_layers: -1
61
+ beam_size: 0
62
+ num_decoder_layers: 2
63
+ nhead: 8
64
+ max_len: *max_text_length
65
+
66
+ Loss:
67
+ name: CAMLoss
68
+ loss_weight_binary: 1.5
69
+ label_smoothing: 0.
70
+
71
+ Metric:
72
+ name: RecMetric
73
+ main_indicator: acc
74
+ is_filter: True
75
+
76
+ PostProcess:
77
+ name: ARLabelDecode
78
+
79
+ Train:
80
+ dataset:
81
+ name: LMDBDataSet
82
+ data_dir: ../Union14M-L-LMDB-Filtered
83
+ transforms:
84
+ - DecodeImagePIL: # load image
85
+ img_mode: RGB
86
+ - PARSeqAugPIL:
87
+ - CAMLabelEncode: # Class handling label
88
+ font_path: ./arial.ttf
89
+ image_shape: *img_shape
90
+ - RecTVResize:
91
+ image_shape: [64, 256]
92
+ padding: False
93
+ - KeepKeys:
94
+ keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
95
+ loader:
96
+ shuffle: True
97
+ batch_size_per_card: 256
98
+ drop_last: True
99
+ num_workers: 4
100
+
101
+ Eval:
102
+ dataset:
103
+ name: LMDBDataSet
104
+ data_dir: ../evaluation
105
+ transforms:
106
+ - DecodeImagePIL: # load image
107
+ img_mode: RGB
108
+ - ARLabelEncode: # Class handling label
109
+ - RecTVResize:
110
+ image_shape: [64, 256]
111
+ padding: False
112
+ - KeepKeys:
113
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
114
+ loader:
115
+ shuffle: False
116
+ drop_last: False
117
+ batch_size_per_card: 256
118
+ num_workers: 2
configs/rec/cam/convnextv2_tiny_cam_tps_on.yml ADDED
@@ -0,0 +1,118 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/convnextv2_tiny_cam_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: False
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_convnextv2_cam_tps_on.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.0008 # for 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+ eps: 1.e-8
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: CAM
36
+ Transform:
37
+ name: Aster_TPS
38
+ tps_inputsize: [32, 64]
39
+ tps_outputsize: &img_shape [32, 128]
40
+ Encoder:
41
+ name: CAMEncoder
42
+ encoder_config:
43
+ name: ConvNeXtV2
44
+ depths: [3, 3, 9, 3]
45
+ dims: [96, 192, 384, 768]
46
+ strides: [[4,4], [2,1], [2,1], [1,1]]
47
+ drop_path_rate: 0.2
48
+ feat2d: True
49
+ nb_classes: 97
50
+ strides: [[4,4], [2,1], [2,1], [1,1]]
51
+ deform_stride: 2
52
+ stage_idx: 2
53
+ use_depthwise_unet: True
54
+ use_more_unet: False
55
+ binary_loss_type: BanlanceMultiClassCrossEntropyLoss
56
+ mid_size: False
57
+ d_embedding: 512
58
+ Decoder:
59
+ name: CAMDecoder
60
+ num_encoder_layers: -1
61
+ beam_size: 0
62
+ num_decoder_layers: 2
63
+ nhead: 8
64
+ max_len: *max_text_length
65
+
66
+ Loss:
67
+ name: CAMLoss
68
+ loss_weight_binary: 1.5
69
+ label_smoothing: 0.
70
+
71
+ Metric:
72
+ name: RecMetric
73
+ main_indicator: acc
74
+ is_filter: True
75
+
76
+ PostProcess:
77
+ name: ARLabelDecode
78
+
79
+ Train:
80
+ dataset:
81
+ name: LMDBDataSet
82
+ data_dir: ../Union14M-L-LMDB-Filtered
83
+ transforms:
84
+ - DecodeImagePIL: # load image
85
+ img_mode: RGB
86
+ - PARSeqAugPIL:
87
+ - CAMLabelEncode: # Class handling label
88
+ font_path: ./arial.ttf
89
+ image_shape: *img_shape
90
+ - RecTVResize:
91
+ image_shape: [64, 256]
92
+ padding: False
93
+ - KeepKeys:
94
+ keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
95
+ loader:
96
+ shuffle: True
97
+ batch_size_per_card: 256
98
+ drop_last: True
99
+ num_workers: 4
100
+
101
+ Eval:
102
+ dataset:
103
+ name: LMDBDataSet
104
+ data_dir: ../evaluation
105
+ transforms:
106
+ - DecodeImagePIL: # load image
107
+ img_mode: RGB
108
+ - ARLabelEncode: # Class handling label
109
+ - RecTVResize:
110
+ image_shape: [64, 256]
111
+ padding: False
112
+ - KeepKeys:
113
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
114
+ loader:
115
+ shuffle: False
116
+ drop_last: False
117
+ batch_size_per_card: 256
118
+ num_workers: 2
configs/rec/cam/svtrv2_cam_tps_on.yml ADDED
@@ -0,0 +1,123 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_cam_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: False
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cam_tps_on.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.00065 # for 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+
27
+ LRScheduler:
28
+ name: OneCycleLR
29
+ warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
30
+ cycle_momentum: False
31
+
32
+ Architecture:
33
+ model_type: rec
34
+ algorithm: CAM
35
+ Transform:
36
+ name: Aster_TPS
37
+ tps_inputsize: [32, 64]
38
+ tps_outputsize: &img_shape [32, 128]
39
+ Encoder:
40
+ name: CAMEncoder
41
+ encoder_config:
42
+ name: SVTRv2LNConvTwo33
43
+ use_pos_embed: False
44
+ dims: [128, 256, 384]
45
+ depths: [6, 6, 6]
46
+ num_heads: [4, 8, 12]
47
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
48
+ local_k: [[5, 5], [5, 5], [-1, -1]]
49
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
50
+ last_stage: false
51
+ feat2d: True
52
+ nb_classes: 97
53
+ strides: [[4, 4], [1, 1], [2, 1], [1, 1]]
54
+ k_size: [[2, 2], [1, 1], [2, 1], [1, 1]]
55
+ q_size: [4, 32]
56
+ deform_stride: 2
57
+ stage_idx: 2
58
+ use_depthwise_unet: True
59
+ use_more_unet: False
60
+ binary_loss_type: BanlanceMultiClassCrossEntropyLoss
61
+ mid_size: True
62
+ d_embedding: 384
63
+ Decoder:
64
+ name: CAMDecoder
65
+ num_encoder_layers: -1
66
+ beam_size: 0
67
+ num_decoder_layers: 2
68
+ nhead: 8
69
+ max_len: *max_text_length
70
+
71
+ Loss:
72
+ name: CAMLoss
73
+ loss_weight_binary: 1.5
74
+ label_smoothing: 0.
75
+
76
+ Metric:
77
+ name: RecMetric
78
+ main_indicator: acc
79
+ is_filter: True
80
+
81
+ PostProcess:
82
+ name: ARLabelDecode
83
+
84
+ Train:
85
+ dataset:
86
+ name: LMDBDataSet
87
+ data_dir: ../Union14M-L-LMDB-Filtered
88
+ transforms:
89
+ - DecodeImagePIL: # load image
90
+ img_mode: RGB
91
+ - PARSeqAugPIL:
92
+ - CAMLabelEncode: # Class handling label
93
+ font_path: ./arial.ttf
94
+ image_shape: *img_shape
95
+ - RecTVResize:
96
+ image_shape: [64, 256]
97
+ padding: False
98
+ - KeepKeys:
99
+ keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
100
+ loader:
101
+ shuffle: True
102
+ batch_size_per_card: 256
103
+ drop_last: True
104
+ num_workers: 4
105
+
106
+ Eval:
107
+ dataset:
108
+ name: LMDBDataSet
109
+ data_dir: ../evaluation
110
+ transforms:
111
+ - DecodeImagePIL: # load image
112
+ img_mode: RGB
113
+ - ARLabelEncode: # Class handling label
114
+ - RecTVResize:
115
+ image_shape: [64, 256]
116
+ padding: False
117
+ - KeepKeys:
118
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
119
+ loader:
120
+ shuffle: False
121
+ drop_last: False
122
+ batch_size_per_card: 256
123
+ num_workers: 2
configs/rec/cdistnet/resnet45_trans_cdistnet.yml ADDED
@@ -0,0 +1,93 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet45_trans_cdistnet
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_cdistnet.txt
19
+ use_amp: True
20
+ grad_clip_val: 5
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.002 # for 4gpus bs256/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: CDistNet
36
+ Transform:
37
+ Encoder:
38
+ name: ResNet45
39
+ in_channels: 3
40
+ strides: [2, 1, 2, 1, 1]
41
+ Decoder:
42
+ name: CDistNetDecoder
43
+ add_conv: True
44
+
45
+ Loss:
46
+ name: ARLoss
47
+
48
+ PostProcess:
49
+ name: ARLabelDecode
50
+
51
+ Metric:
52
+ name: RecMetric
53
+ main_indicator: acc
54
+ is_filter: True
55
+
56
+ Train:
57
+ dataset:
58
+ name: LMDBDataSet
59
+ data_dir: ../Union14M-L-LMDB-Filtered
60
+ transforms:
61
+ - DecodeImagePIL: # load image
62
+ img_mode: RGB
63
+ - PARSeqAugPIL:
64
+ - ARLabelEncode: # Class handling label
65
+ - RecTVResize:
66
+ image_shape: [32, 128]
67
+ padding: False
68
+ - KeepKeys:
69
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
70
+ loader:
71
+ shuffle: True
72
+ batch_size_per_card: 256
73
+ drop_last: True
74
+ num_workers: 4
75
+
76
+ Eval:
77
+ dataset:
78
+ name: LMDBDataSet
79
+ data_dir: ../evaluation
80
+ transforms:
81
+ - DecodeImagePIL: # load image
82
+ img_mode: RGB
83
+ - ARLabelEncode: # Class handling label
84
+ - RecTVResize:
85
+ image_shape: [32, 128]
86
+ padding: False
87
+ - KeepKeys:
88
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
89
+ loader:
90
+ shuffle: False
91
+ drop_last: False
92
+ batch_size_per_card: 256
93
+ num_workers: 2
configs/rec/cdistnet/svtrv2_cdistnet.yml ADDED
@@ -0,0 +1,139 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_cdistnet/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
16
+ # ./tools/utils/ppocr_keys_v1.txt # ch
17
+ max_text_length: &max_text_length 25
18
+ use_space_char: &use_space_char False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cdistnet.txt
20
+ use_amp: True
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065 #4gpus bs256/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: CDistNet
36
+ in_channels: 3
37
+ Transform:
38
+ Encoder:
39
+ name: SVTRv2LNConvTwo33
40
+ use_pos_embed: False
41
+ out_channels: 256
42
+ dims: [128, 256, 384]
43
+ depths: [6, 6, 6]
44
+ num_heads: [4, 8, 12]
45
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
46
+ local_k: [[5, 5], [5, 5], [-1, -1]]
47
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
48
+ last_stage: false
49
+ feat2d: True
50
+ Decoder:
51
+ name: CDistNetDecoder
52
+ add_conv: False
53
+ num_encoder_blocks: 0
54
+
55
+ Loss:
56
+ name: ARLoss
57
+
58
+ PostProcess:
59
+ name: ARLabelDecode
60
+ character_dict_path: *character_dict_path
61
+ use_space_char: *use_space_char
62
+
63
+ Metric:
64
+ name: RecMetric
65
+ main_indicator: acc
66
+ is_filter: True
67
+
68
+ Train:
69
+ dataset:
70
+ name: RatioDataSetTVResize
71
+ ds_width: True
72
+ padding: false
73
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
75
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
76
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
77
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
78
+ ]
79
+ transforms:
80
+ - DecodeImagePIL: # load image
81
+ img_mode: RGB
82
+ - PARSeqAugPIL:
83
+ - ARLabelEncode: # Class handling label
84
+ character_dict_path: *character_dict_path
85
+ use_space_char: *use_space_char
86
+ max_text_length: *max_text_length
87
+ - KeepKeys:
88
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
89
+ sampler:
90
+ name: RatioSampler
91
+ scales: [[128, 32]] # w, h
92
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
93
+ first_bs: &bs 256
94
+ fix_bs: false
95
+ divided_factor: [4, 16] # w, h
96
+ is_training: True
97
+ loader:
98
+ shuffle: True
99
+ batch_size_per_card: *bs
100
+ drop_last: True
101
+ max_ratio: &max_ratio 4
102
+ num_workers: 4
103
+
104
+ Eval:
105
+ dataset:
106
+ name: RatioDataSetTVResize
107
+ ds_width: True
108
+ padding: False
109
+ data_dir_list: [
110
+ '../evaluation/CUTE80',
111
+ '../evaluation/IC13_857',
112
+ '../evaluation/IC15_1811',
113
+ '../evaluation/IIIT5k',
114
+ '../evaluation/SVT',
115
+ '../evaluation/SVTP',
116
+ ]
117
+ transforms:
118
+ - DecodeImagePIL: # load image
119
+ img_mode: RGB
120
+ - ARLabelEncode: # Class handling label
121
+ character_dict_path: *character_dict_path
122
+ use_space_char: *use_space_char
123
+ max_text_length: *max_text_length
124
+ - KeepKeys:
125
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
126
+ sampler:
127
+ name: RatioSampler
128
+ scales: [[128, 32]] # w, h
129
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
130
+ first_bs: *bs
131
+ fix_bs: false
132
+ divided_factor: [4, 16] # w, h
133
+ is_training: False
134
+ loader:
135
+ shuffle: False
136
+ drop_last: False
137
+ batch_size_per_card: *bs
138
+ max_ratio: *max_ratio
139
+ num_workers: 4
configs/rec/cppd/svtr_base_cppd.yml ADDED
@@ -0,0 +1,123 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtr_base_cppd/
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path
18
+ # ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_cppd.txt
23
+ use_amp: True
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.00065 # for 4gpus bs256/gpu
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
34
+ cycle_momentum: False
35
+
36
+ Architecture:
37
+ model_type: rec
38
+ algorithm: CPPD
39
+ in_channels: 3
40
+ Transform:
41
+ Encoder:
42
+ name: SVTRNet
43
+ img_size: [32, 128]
44
+ out_char_num: 25
45
+ out_channels: 256
46
+ patch_merging: 'Conv'
47
+ embed_dim: [128, 256, 384]
48
+ depth: [6, 6, 6]
49
+ num_heads: [4, 8, 12]
50
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
51
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
52
+ last_stage: False
53
+ prenorm: True
54
+ Decoder:
55
+ name: CPPDDecoder
56
+ vis_seq: 64
57
+ num_layer: 2
58
+ pos_len: False
59
+ rec_layer: 1
60
+
61
+
62
+ Loss:
63
+ name: CPPDLoss
64
+ ignore_index: 100
65
+ smoothing: True
66
+ pos_len: False
67
+ sideloss_weight: 1.0
68
+
69
+ PostProcess:
70
+ name: CPPDLabelDecode
71
+ character_dict_path: *character_dict_path
72
+ use_space_char: *use_space_char
73
+
74
+ Metric:
75
+ name: RecMetric
76
+ main_indicator: acc
77
+
78
+ Train:
79
+ dataset:
80
+ name: LMDBDataSet
81
+ data_dir: ../Union14M-L-LMDB-Filtered
82
+ transforms:
83
+ - DecodeImagePIL: # load image
84
+ img_mode: RGB
85
+ - PARSeqAugPIL:
86
+ - CPPDLabelEncode: # Class handling label
87
+ pos_len: False
88
+ character_dict_path: *character_dict_path
89
+ use_space_char: *use_space_char
90
+ max_text_length: *max_text_length
91
+ - RecTVResize:
92
+ image_shape: [32, 128]
93
+ padding: False
94
+ - KeepKeys:
95
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
96
+ loader:
97
+ shuffle: True
98
+ batch_size_per_card: 256
99
+ drop_last: True
100
+ num_workers: 4
101
+
102
+ Eval:
103
+ dataset:
104
+ name: LMDBDataSet
105
+ data_dir: ../evaluation/
106
+ transforms:
107
+ - DecodeImagePIL: # load image
108
+ img_mode: RGB
109
+ - CPPDLabelEncode: # Class handling label
110
+ pos_len: False
111
+ character_dict_path: *character_dict_path
112
+ use_space_char: *use_space_char
113
+ max_text_length: *max_text_length
114
+ - RecTVResize:
115
+ image_shape: [32, 128]
116
+ padding: False
117
+ - KeepKeys:
118
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
119
+ loader:
120
+ shuffle: False
121
+ drop_last: False
122
+ batch_size_per_card: 128
123
+ num_workers: 4
configs/rec/cppd/svtr_base_cppd_ch.yml ADDED
@@ -0,0 +1,126 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 100
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/ch/svtr_base_cppd/
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 2000]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: False
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/ppocr_keys_v1.txt
18
+ # ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/ch/predicts_svtr_base_cppd.txt
23
+ use_amp: True
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.0005 # for 4gpus bs128/gpu
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: CosineAnnealingLR
33
+ warmup_epoch: 5
34
+
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: CPPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: SVTRNet
42
+ img_size: [32, 256]
43
+ patch_merging: 'Conv'
44
+ embed_dim: [128, 256, 384]
45
+ depth: [6, 6, 4]
46
+ num_heads: [4, 8, 12]
47
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
48
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
49
+ last_stage: False
50
+ prenorm: True
51
+ Decoder:
52
+ name: CPPDDecoder
53
+ vis_seq: 128
54
+ num_layer: 3
55
+ pos_len: False
56
+ rec_layer: 1
57
+ ch: True
58
+
59
+
60
+ Loss:
61
+ name: CPPDLoss
62
+ ignore_index: 7000
63
+ smoothing: True
64
+ pos_len: False
65
+ sideloss_weight: 1.0
66
+
67
+ PostProcess:
68
+ name: CPPDLabelDecode
69
+ character_dict_path: *character_dict_path
70
+ use_space_char: *use_space_char
71
+
72
+ Metric:
73
+ name: RecMetric
74
+ main_indicator: acc
75
+
76
+ Train:
77
+ dataset:
78
+ name: LMDBDataSet
79
+ data_dir: ../benchmark_bctr/benchmark_bctr_train
80
+ transforms:
81
+ - DecodeImage: # load image
82
+ img_mode: BGR
83
+ channel_first: False
84
+ - CPPDLabelEncode: # Class handling label
85
+ pos_len: False
86
+ ch: True
87
+ ignore_index: 7000
88
+ character_dict_path: *character_dict_path
89
+ use_space_char: *use_space_char
90
+ max_text_length: *max_text_length
91
+ - SVTRResize:
92
+ image_shape: [3, 32, 256]
93
+ padding: True
94
+ - KeepKeys:
95
+ keep_keys: ['image', 'label', 'label_node', 'label_index', 'length'] # dataloader will return list in this order
96
+ loader:
97
+ shuffle: True
98
+ batch_size_per_card: 128
99
+ drop_last: True
100
+ num_workers: 8
101
+
102
+ Eval:
103
+ dataset:
104
+ name: LMDBDataSet
105
+ data_dir: ../benchmark_bctr/benchmark_bctr_test/scene_test
106
+ transforms:
107
+ - DecodeImage: # load image
108
+ img_mode: BGR
109
+ channel_first: False
110
+ - CPPDLabelEncode: # Class handling label
111
+ pos_len: False
112
+ ch: True
113
+ ignore_index: 7000
114
+ character_dict_path: *character_dict_path
115
+ use_space_char: *use_space_char
116
+ max_text_length: *max_text_length
117
+ - SVTRResize:
118
+ image_shape: [3, 32, 256]
119
+ padding: True
120
+ - KeepKeys:
121
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
122
+ loader:
123
+ shuffle: False
124
+ drop_last: False
125
+ batch_size_per_card: 256
126
+ num_workers: 4
configs/rec/cppd/svtr_base_cppd_h8.yml ADDED
@@ -0,0 +1,123 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtr_base_h8_cppd/
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_cppd.txt
22
+ use_amp: True
23
+
24
+ Optimizer:
25
+ name: AdamW
26
+ lr: 0.00065 # for 4gpus bs256/gpu
27
+ weight_decay: 0.05
28
+ filter_bias_and_bn: True
29
+
30
+ LRScheduler:
31
+ name: OneCycleLR
32
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
33
+ cycle_momentum: False
34
+
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: CPPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: SVTRNet
42
+ img_size: [32, 128]
43
+ out_char_num: 25
44
+ out_channels: 256
45
+ patch_merging: 'Conv'
46
+ embed_dim: [128, 256, 384]
47
+ depth: [6, 6, 6]
48
+ num_heads: [4, 8, 12]
49
+ sub_k: [[1, 1], [2, 1]]
50
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
51
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
52
+ last_stage: False
53
+ prenorm: True
54
+ Decoder:
55
+ name: CPPDDecoder
56
+ vis_seq: 128
57
+ num_layer: 2
58
+ pos_len: False
59
+ rec_layer: 1
60
+
61
+ Loss:
62
+ name: CPPDLoss
63
+ ignore_index: 100
64
+ smoothing: True
65
+ pos_len: False
66
+ sideloss_weight: 1.0
67
+
68
+ PostProcess:
69
+ name: CPPDLabelDecode
70
+ character_dict_path: *character_dict_path
71
+ use_space_char: *use_space_char
72
+
73
+ Metric:
74
+ name: RecMetric
75
+ main_indicator: acc
76
+ is_filter: True
77
+
78
+ Train:
79
+ dataset:
80
+ name: LMDBDataSet
81
+ data_dir: ../Union14M-L-LMDB-Filtered
82
+ transforms:
83
+ - DecodeImagePIL: # load image
84
+ img_mode: RGB
85
+ - PARSeqAugPIL:
86
+ - CPPDLabelEncode: # Class handling label
87
+ pos_len: False
88
+ character_dict_path: *character_dict_path
89
+ use_space_char: *use_space_char
90
+ max_text_length: *max_text_length
91
+ - RecTVResize:
92
+ image_shape: [32, 128]
93
+ padding: False
94
+ - KeepKeys:
95
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
96
+ loader:
97
+ shuffle: True
98
+ batch_size_per_card: 256
99
+ drop_last: True
100
+ num_workers: 4
101
+
102
+ Eval:
103
+ dataset:
104
+ name: LMDBDataSet
105
+ data_dir: ../evaluation/
106
+ transforms:
107
+ - DecodeImagePIL: # load image
108
+ img_mode: RGB
109
+ - CPPDLabelEncode: # Class handling label
110
+ pos_len: False
111
+ character_dict_path: *character_dict_path
112
+ use_space_char: *use_space_char
113
+ max_text_length: *max_text_length
114
+ - RecTVResize:
115
+ image_shape: [32, 128]
116
+ padding: False
117
+ - KeepKeys:
118
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
119
+ loader:
120
+ shuffle: False
121
+ drop_last: False
122
+ batch_size_per_card: 128
123
+ num_workers: 4
configs/rec/cppd/svtr_base_cppd_syn.yml ADDED
@@ -0,0 +1,124 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 60
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/syn/svtr_base_cppd/
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path
18
+ # ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/syn/predicts_svtr_base_cppd.txt
23
+ use_amp: True
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.0005 # for 4gpus bs256/gpu
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: CosineAnnealingLR
33
+ warmup_epoch: 6
34
+
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: CPPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: SVTRNet
42
+ img_size: [32, 100]
43
+ out_char_num: 25
44
+ out_channels: 256
45
+ patch_merging: 'Conv'
46
+ embed_dim: [128, 256, 384]
47
+ depth: [6, 6, 4]
48
+ num_heads: [4, 8, 12]
49
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
50
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
51
+ last_stage: False
52
+ prenorm: True
53
+ Decoder:
54
+ name: CPPDDecoder
55
+ vis_seq: 50
56
+ num_layer: 3
57
+ pos_len: False
58
+ rec_layer: 1
59
+
60
+
61
+ Loss:
62
+ name: CPPDLoss
63
+ ignore_index: 100
64
+ smoothing: True
65
+ pos_len: False
66
+ sideloss_weight: 1.0
67
+
68
+ PostProcess:
69
+ name: CPPDLabelDecode
70
+ character_dict_path: *character_dict_path
71
+ use_space_char: *use_space_char
72
+
73
+ Metric:
74
+ name: RecMetric
75
+ main_indicator: acc
76
+
77
+ Train:
78
+ dataset:
79
+ name: STRLMDBDataSet
80
+ data_dir: ./
81
+ transforms:
82
+ - DecodeImage: # load image
83
+ img_mode: BGR
84
+ channel_first: False
85
+ # - SVTRRAug:
86
+ - CPPDLabelEncode: # Class handling label
87
+ pos_len: False
88
+ character_dict_path: *character_dict_path
89
+ use_space_char: *use_space_char
90
+ max_text_length: *max_text_length
91
+ - SVTRResize:
92
+ image_shape: [3, 32, 100]
93
+ padding: False
94
+ - KeepKeys:
95
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
96
+ loader:
97
+ shuffle: True
98
+ batch_size_per_card: 256
99
+ drop_last: True
100
+ num_workers: 8
101
+
102
+ Eval:
103
+ dataset:
104
+ name: LMDBDataSet
105
+ data_dir: ../evaluation/
106
+ transforms:
107
+ - DecodeImage: # load image
108
+ img_mode: BGR
109
+ channel_first: False
110
+ - CPPDLabelEncode: # Class handling label
111
+ pos_len: False
112
+ character_dict_path: *character_dict_path
113
+ use_space_char: *use_space_char
114
+ max_text_length: *max_text_length
115
+ - SVTRResize:
116
+ image_shape: [3, 32, 100]
117
+ padding: False
118
+ - KeepKeys:
119
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
120
+ loader:
121
+ shuffle: False
122
+ drop_last: False
123
+ batch_size_per_card: 256
124
+ num_workers: 4
configs/rec/cppd/svtrv2_cppd.yml ADDED
@@ -0,0 +1,150 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_cppd/
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cppd.txt
22
+ use_amp: True
23
+
24
+ Optimizer:
25
+ name: AdamW
26
+ lr: 0.00065 # for 4gpus bs256/gpu
27
+ weight_decay: 0.05
28
+ filter_bias_and_bn: True
29
+
30
+ LRScheduler:
31
+ name: OneCycleLR
32
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
33
+ cycle_momentum: False
34
+
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: CPPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: SVTRv2LNConvTwo33
42
+ use_pos_embed: False
43
+ out_channels: 256
44
+ dims: [128, 256, 384]
45
+ depths: [6, 6, 6]
46
+ num_heads: [4, 8, 12]
47
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
48
+ local_k: [[5, 5], [5, 5], [-1, -1]]
49
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
50
+ last_stage: false
51
+ feat2d: False
52
+ Decoder:
53
+ name: CPPDDecoder
54
+ ds: True
55
+ num_layer: 2
56
+ pos_len: False
57
+ rec_layer: 1
58
+
59
+
60
+ Loss:
61
+ name: CPPDLoss
62
+ ignore_index: 100
63
+ smoothing: True
64
+ pos_len: False
65
+ sideloss_weight: 1.0
66
+
67
+ PostProcess:
68
+ name: CPPDLabelDecode
69
+ character_dict_path: *character_dict_path
70
+ use_space_char: *use_space_char
71
+
72
+ Metric:
73
+ name: RecMetric
74
+ main_indicator: acc
75
+ is_filter: True
76
+
77
+ Train:
78
+ dataset:
79
+ name: RatioDataSetTVResize
80
+ ds_width: True
81
+ padding: false
82
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
83
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
84
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
85
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
86
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
87
+ ]
88
+ transforms:
89
+ - DecodeImagePIL: # load image
90
+ img_mode: RGB
91
+ - PARSeqAugPIL:
92
+ - CPPDLabelEncode: # Class handling label
93
+ pos_len: False
94
+ character_dict_path: *character_dict_path
95
+ use_space_char: *use_space_char
96
+ max_text_length: *max_text_length
97
+ - KeepKeys:
98
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
99
+ sampler:
100
+ name: RatioSampler
101
+ scales: [[128, 32]] # w, h
102
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
103
+ first_bs: &bs 256
104
+ fix_bs: false
105
+ divided_factor: [4, 16] # w, h
106
+ is_training: True
107
+ loader:
108
+ shuffle: True
109
+ batch_size_per_card: *bs
110
+ drop_last: True
111
+ max_ratio: &max_ratio 4
112
+ num_workers: 4
113
+
114
+ Eval:
115
+ dataset:
116
+ name: RatioDataSetTVResize
117
+ ds_width: True
118
+ padding: False
119
+ data_dir_list: [
120
+ '../evaluation/CUTE80',
121
+ '../evaluation/IC13_857',
122
+ '../evaluation/IC15_1811',
123
+ '../evaluation/IIIT5k',
124
+ '../evaluation/SVT',
125
+ '../evaluation/SVTP',
126
+ ]
127
+ transforms:
128
+ - DecodeImagePIL: # load image
129
+ img_mode: RGB
130
+ - CPPDLabelEncode: # Class handling label
131
+ pos_len: False
132
+ character_dict_path: *character_dict_path
133
+ use_space_char: *use_space_char
134
+ max_text_length: *max_text_length
135
+ - KeepKeys:
136
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
137
+ sampler:
138
+ name: RatioSampler
139
+ scales: [[128, 32]] # w, h
140
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
141
+ first_bs: *bs
142
+ fix_bs: false
143
+ divided_factor: [4, 16] # w, h
144
+ is_training: False
145
+ loader:
146
+ shuffle: False
147
+ drop_last: False
148
+ batch_size_per_card: *bs
149
+ max_ratio: *max_ratio
150
+ num_workers: 4
configs/rec/dan/resnet45_fpn_dan.yml ADDED
@@ -0,0 +1,98 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet45_fpn_dan/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_resnet45_fpn_dan.txt
19
+ use_amp: True
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.00065 # for 4gpus bs256/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: DAN
36
+ Transform:
37
+ Encoder:
38
+ name: ResNet45
39
+ in_channels: 3
40
+ strides: [2, 1, 2, 1, 1]
41
+ return_list: True
42
+ Decoder:
43
+ name: DANDecoder
44
+ max_len: 25
45
+ channels_list: [64, 128, 256, 512]
46
+ strides_list: [[2, 2], [1, 1], [1, 1]]
47
+ in_shape: [8, 32]
48
+ depth: 4
49
+
50
+ Loss:
51
+ name: ARLoss
52
+
53
+ PostProcess:
54
+ name: ARLabelDecode
55
+
56
+ Metric:
57
+ name: RecMetric
58
+ main_indicator: acc
59
+ is_filter: True
60
+
61
+ Train:
62
+ dataset:
63
+ name: LMDBDataSet
64
+ data_dir: ../Union14M-L-LMDB-Filtered
65
+ transforms:
66
+ - DecodeImagePIL: # load image
67
+ img_mode: RGB
68
+ - PARSeqAugPIL:
69
+ - ARLabelEncode:
70
+ - RecTVResize:
71
+ image_shape: [32, 128]
72
+ padding: False
73
+ - KeepKeys:
74
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
75
+ loader:
76
+ shuffle: True
77
+ batch_size_per_card: 256
78
+ drop_last: True
79
+ num_workers: 4
80
+
81
+ Eval:
82
+ dataset:
83
+ name: LMDBDataSet
84
+ data_dir: ../evaluation
85
+ transforms:
86
+ - DecodeImagePIL: # load image
87
+ img_mode: RGB
88
+ - ARLabelEncode:
89
+ - RecTVResize:
90
+ image_shape: [32, 128]
91
+ padding: False
92
+ - KeepKeys:
93
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
94
+ loader:
95
+ shuffle: False
96
+ drop_last: False
97
+ batch_size_per_card: 256
98
+ num_workers: 2
configs/rec/dan/svtrv2_dan.yml ADDED
@@ -0,0 +1,130 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_dan
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_dan.txt
19
+ use_amp: True
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065 # 4gpus 256bs/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: DAN
36
+ Transform:
37
+ Encoder:
38
+ name: SVTRv2LNConvTwo33
39
+ use_pos_embed: False
40
+ out_channels: 256
41
+ dims: [128, 256, 384]
42
+ depths: [6, 6, 6]
43
+ num_heads: [4, 8, 12]
44
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
45
+ local_k: [[5, 5], [5, 5], [-1, -1]]
46
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
47
+ last_stage: false
48
+ feat2d: True
49
+ Decoder:
50
+ name: DANDecoder
51
+ use_cam: False
52
+ max_len: 25
53
+
54
+ Loss:
55
+ name: ARLoss
56
+
57
+ PostProcess:
58
+ name: ARLabelDecode
59
+
60
+ Metric:
61
+ name: RecMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: RatioDataSetTVResize
68
+ ds_width: True
69
+ padding: false
70
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
71
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
72
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
73
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_normal',
74
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_easy',
75
+ ]
76
+ transforms:
77
+ - DecodeImagePIL: # load image
78
+ img_mode: RGB
79
+ - PARSeqAugPIL:
80
+ - ARLabelEncode:
81
+ - KeepKeys:
82
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
83
+ sampler:
84
+ name: RatioSampler
85
+ scales: [[128, 32]] # w, h
86
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
87
+ first_bs: &bs 256
88
+ fix_bs: false
89
+ divided_factor: [4, 16] # w, h
90
+ is_training: True
91
+ loader:
92
+ shuffle: True
93
+ batch_size_per_card: *bs
94
+ drop_last: True
95
+ max_ratio: &max_ratio 4
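+      # Assumption: max_ratio caps the width/height aspect-ratio bucket used by RatioSampler, i.e. with
+      # the [128, 32] scale, batches are formed at widths up to roughly 4 * 32 = 128 px.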
96
+ num_workers: 4
97
+
98
+ Eval:
99
+ dataset:
100
+ name: RatioDataSetTVResize
101
+ ds_width: True
102
+ padding: False
103
+ data_dir_list: [
104
+ '../evaluation/CUTE80',
105
+ '../evaluation/IC13_857',
106
+ '../evaluation/IC15_1811',
107
+ '../evaluation/IIIT5k',
108
+ '../evaluation/SVT',
109
+ '../evaluation/SVTP',
110
+ ]
111
+ transforms:
112
+ - DecodeImagePIL: # load image
113
+ img_mode: RGB
114
+ - ARLabelEncode:
115
+ - KeepKeys:
116
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
117
+ sampler:
118
+ name: RatioSampler
119
+ scales: [[128, 32]] # w, h
120
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
121
+ first_bs: *bs
122
+ fix_bs: false
123
+ divided_factor: [4, 16] # w, h
124
+ is_training: False
125
+ loader:
126
+ shuffle: False
127
+ drop_last: False
128
+ batch_size_per_card: *bs
129
+ max_ratio: *max_ratio
130
+ num_workers: 4
configs/rec/dptr/dptr_parseq_pretrain.yml ADDED
@@ -0,0 +1,88 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: /share/ckpt/zhaoshuai/openocr/dptr_parseq/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: &use_space_char False
18
+ use_amp: True
19
+ save_res_path: /share/ckpt/zhaoshuai/openocr/dptr_parseq/predicts_dptr_parseq.txt
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.001485 # 2gpus 384bs/gpu
25
+ weight_decay: 0.
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: DPTR
36
+ Decoder:
37
+ name: DptrParseq
38
+ decode_ar: True
39
+ refine_iters: 1
40
+ is_pretrain: True
41
+ ORP_path: /share/ckpt/zhaoshuai/parseq/clip_background.pth
42
+
43
+ Loss:
44
+ name: PARSeqLoss
45
+
46
+ PostProcess:
47
+ name: ARLabelDecode
48
+ character_dict_path: *character_dict_path
49
+ use_space_char: *use_space_char
50
+
51
+ Metric:
52
+ name: RecMetric
53
+ main_indicator: acc
54
+ is_filter: True
55
+
56
+ Train:
57
+ dataset:
58
+ name: TextLMDBDataSet
59
+ data_dir: /share/test/zhaoshuai/parseq-data/data/train/real/ArT
60
+ transforms:
61
+ - DPTRLabelEncode: # Class handling label
62
+ character_dict_path: *character_dict_path
63
+ use_space_char: *use_space_char
64
+ max_text_length: *max_text_length
65
+ - KeepKeys:
66
+ keep_keys: ['clip_label', 'label'] # dataloader will return list in this order
67
+ loader:
68
+ shuffle: True
69
+ batch_size_per_card: 256
70
+ drop_last: True
71
+ num_workers: 4
72
+
73
+ Eval:
74
+ dataset:
75
+ name: TextLMDBDataSet
76
+ data_dir: /share/test/zhaoshuai/parseq-data/data/val
77
+ transforms:
78
+ - DPTRLabelEncode: # Class handling label
79
+ character_dict_path: *character_dict_path
80
+ use_space_char: *use_space_char
81
+ max_text_length: *max_text_length
82
+ - KeepKeys:
83
+ keep_keys: ['clip_label', 'label'] # dataloader will return list in this order
84
+ loader:
85
+ shuffle: False
86
+ drop_last: False
87
+ batch_size_per_card: 256
88
+ num_workers: 2
configs/rec/focalsvtr/focalsvtr_ctc.yml ADDED
@@ -0,0 +1,137 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/focalsvtr_ctc/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path
16
+ # ./tools/utils/EN_symbol_dict.txt
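+  # (assumption) with character_dict_path left empty, the recognizer presumably falls back to its
+  # default lowercase-alphanumeric character set; uncomment the path above to use the 96-char English dict.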
17
+ max_text_length: &max_text_length 25
18
+ use_space_char: &use_space_char False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_focalsvtr_ctc.txt
20
+
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065 # for 4gpus bs256/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+
31
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
32
+ cycle_momentum: False
33
+
34
+ Architecture:
35
+ model_type: rec
36
+ algorithm: SVTR
37
+ Transform:
38
+ Encoder:
39
+ name: FocalSVTR
40
+ img_size: [32, 128]
41
+ depths: [6, 6, 6]
42
+ embed_dim: 96
43
+ sub_k: [[1, 1], [2, 1], [1, 1]]
44
+ focal_levels: [3, 3, 3]
45
+ out_channels: 256
46
+ last_stage: True
47
+ Decoder:
48
+ name: CTCDecoder
49
+
50
+ Loss:
51
+ name: CTCLoss
52
+ zero_infinity: True
53
+
54
+ PostProcess:
55
+ name: CTCLabelDecode
56
+ character_dict_path: *character_dict_path
57
+ use_space_char: *use_space_char
58
+
59
+ Metric:
60
+ name: RecMetric
61
+ main_indicator: acc
62
+ is_filter: True
63
+
64
+
65
+ Train:
66
+ dataset:
67
+ name: RatioDataSet
68
+ ds_width: True
69
+ padding: &padding False
70
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
71
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
72
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
73
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
75
+ ]
76
+ transforms:
77
+ - DecodeImage: # load image
78
+ img_mode: BGR
79
+ channel_first: False
80
+ - PARSeqAug:
81
+ - CTCLabelEncode: # Class handling label
82
+ character_dict_path: *character_dict_path
83
+ use_space_char: *use_space_char
84
+ max_text_length: *max_text_length
85
+ - KeepKeys:
86
+ keep_keys: ['image', 'label', 'length']
87
+ sampler:
88
+ name: RatioSampler
89
+ scales: [[128, 32]] # w, h
90
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
91
+ first_bs: &bs 256
92
+ fix_bs: false
93
+ divided_factor: [4, 16] # w, h
94
+ is_training: True
95
+ loader:
96
+ shuffle: True
97
+ batch_size_per_card: *bs
98
+ drop_last: True
99
+ max_ratio: 12
100
+ num_workers: 4
101
+
102
+ Eval:
103
+ dataset:
104
+ name: RatioDataSet
105
+ ds_width: True
106
+ padding: True
107
+ data_dir_list: ['../evaluation/CUTE80',
108
+ '../evaluation/IC13_857',
109
+ '../evaluation/IC15_1811',
110
+ '../evaluation/IIIT5k',
111
+ '../evaluation/SVT',
112
+ '../evaluation/SVTP',
113
+ ]
114
+ transforms:
115
+ - DecodeImage: # load image
116
+ img_mode: BGR
117
+ channel_first: False
118
+ - CTCLabelEncode: # Class handling label
119
+ character_dict_path: *character_dict_path
120
+ use_space_char: *use_space_char
121
+ max_text_length: *max_text_length
122
+ - KeepKeys:
123
+ keep_keys: ['image', 'label', 'length']
124
+ sampler:
125
+ name: RatioSampler
126
+ scales: [[128, 32]] # w, h
127
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
128
+ first_bs: 128
129
+ fix_bs: false
130
+ divided_factor: [4, 16] # w, h
131
+ is_training: False
132
+ loader:
133
+ shuffle: False
134
+ drop_last: False
135
+ batch_size_per_card: 128
136
+ max_ratio: 12
137
+ num_workers: 4
configs/rec/gtc/svtrv2_lnconv_nrtr_gtc.yml ADDED
@@ -0,0 +1,168 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/svtrv2_lnconv_nrtr_gtc
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img: ../ltb/img
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/predicts_smtr.txt
22
+ use_amp: True
23
+ distributed: true
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.00065
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
34
+ cycle_momentum: False
35
+
36
+ Architecture:
37
+ model_type: rec
38
+ algorithm: BGPD
39
+ in_channels: 3
40
+ Transform:
41
+ Encoder:
42
+ name: SVTRv2LNConvTwo33
43
+ use_pos_embed: False
44
+ out_channels: 256
45
+ dims: [128, 256, 384]
46
+ depths: [6, 6, 6]
47
+ num_heads: [4, 8, 12]
48
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
49
+ local_k: [[5, 5], [5, 5], [-1, -1]]
50
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
51
+ last_stage: false
52
+ feat2d: True
53
+ Decoder:
54
+ name: GTCDecoder
55
+ infer_gtc: True
56
+ detach: False
57
+ gtc_decoder:
58
+ name: NRTRDecoder
59
+ num_encoder_layers: -1
60
+ beam_size: 0
61
+ num_decoder_layers: 2
62
+ nhead: 12
63
+ max_len: *max_text_length
64
+ ctc_decoder:
65
+ name: RCTCDecoder
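+    # The GTC head pairs an attention branch (gtc_decoder, here NRTR) with a CTC branch (ctc_decoder);
+    # both are supervised through GTCLoss, and infer_gtc selects the attention branch at inference
+    # (compare the *_long_infer config, where infer_gtc is False and only the CTC branch is decoded).
+    # This summary is an assumption read off the config fields; see the GTCDecoder code for details.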
66
+
67
+ Loss:
68
+ name: GTCLoss
69
+ gtc_loss:
70
+ name: ARLoss
71
+
72
+ PostProcess:
73
+ name: GTCLabelDecode
74
+ gtc_label_decode:
75
+ name: ARLabelDecode
76
+ character_dict_path: *character_dict_path
77
+ use_space_char: *use_space_char
78
+
79
+ Metric:
80
+ name: RecGTCMetric
81
+ main_indicator: acc
82
+ is_filter: True
83
+
84
+ Train:
85
+ dataset:
86
+ name: RatioDataSet
87
+ ds_width: True
88
+ # max_ratio: &max_ratio 4
89
+ # min_ratio: 1
90
+ # base_shape: &base_shape [[64, 64], [96, 48], [112, 40], [128, 32]]
91
+ # base_h: &base_h 32
92
+ # padding: &padding False
93
+ padding: false
94
+ # padding_rand: true
95
+ # padding_doub: true
96
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
97
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
98
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
99
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
100
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
101
+ ]
102
+ transforms:
103
+ - DecodeImage: # load image
104
+ img_mode: BGR
105
+ channel_first: False
106
+ - PARSeqAug:
107
+ - GTCLabelEncode: # Class handling label
108
+ gtc_label_encode:
109
+ name: ARLabelEncode
110
+ character_dict_path: *character_dict_path
111
+ use_space_char: *use_space_char
112
+ max_text_length: *max_text_length
113
+ - KeepKeys:
114
+ keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
115
+ sampler:
116
+ name: RatioSampler
117
+ scales: [[128, 32]] # w, h
118
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
119
+ first_bs: &bs 256
120
+ fix_bs: false
121
+ divided_factor: [4, 16] # w, h
122
+ is_training: True
123
+ loader:
124
+ shuffle: True
125
+ batch_size_per_card: *bs
126
+ drop_last: True
127
+ max_ratio: &max_ratio 4
128
+ num_workers: 4
129
+
130
+ Eval:
131
+ dataset:
132
+ name: RatioDataSet
133
+ ds_width: True
134
+ padding: False
135
+ data_dir_list: [
136
+ '../evaluation/CUTE80',
137
+ '../evaluation/IC13_857',
138
+ '../evaluation/IC15_1811',
139
+ '../evaluation/IIIT5k',
140
+ '../evaluation/SVT',
141
+ '../evaluation/SVTP',
142
+ ]
143
+ transforms:
144
+ - DecodeImage: # load image
145
+ img_mode: BGR
146
+ channel_first: False
147
+ - GTCLabelEncode: # Class handling label
148
+ gtc_label_encode:
149
+ name: ARLabelEncode
150
+ character_dict_path: *character_dict_path
151
+ use_space_char: *use_space_char
152
+ max_text_length: *max_text_length
153
+ - KeepKeys:
154
+ keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
155
+ sampler:
156
+ name: RatioSampler
157
+ scales: [[128, 32]] # w, h
158
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
159
+ first_bs: *bs
160
+ fix_bs: false
161
+ divided_factor: [4, 16] # w, h
162
+ is_training: False
163
+ loader:
164
+ shuffle: False
165
+ drop_last: False
166
+ batch_size_per_card: *bs
167
+ max_ratio: *max_ratio
168
+ num_workers: 4
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_long_infer.yml ADDED
@@ -0,0 +1,151 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_long_infer
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 1000]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img: ../ltb/img
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/predicts_smtr.txt
22
+ use_amp: True
23
+ distributed: true
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.000325
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
34
+ cycle_momentum: False
35
+
36
+ Architecture:
37
+ model_type: rec
38
+ algorithm: BGPD
39
+ in_channels: 3
40
+ Transform:
41
+ Encoder:
42
+ name: SVTRv2LNConvTwo33
43
+ use_pos_embed: False
44
+ out_channels: 256
45
+ dims: [128, 256, 384]
46
+ depths: [6, 6, 6]
47
+ num_heads: [4, 8, 12]
48
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
49
+ local_k: [[5, 5], [5, 5], [-1, -1]]
50
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
51
+ last_stage: false
52
+ feat2d: True
53
+ Decoder:
54
+ name: GTCDecoder
55
+ infer_gtc: False
56
+ detach: False
57
+ gtc_decoder:
58
+ name: SMTRDecoder
59
+ num_layer: 1
60
+ ds: True
61
+ max_len: *max_text_length
62
+ next_mode: &next True
63
+ sub_str_len: &subsl 5
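+      # Assumption: SMTR decodes by repeatedly predicting the character adjacent to a sub-string of
+      # sub_str_len characters (direction controlled by next_mode), which is what allows it to handle
+      # text longer than max_text_length at inference.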
64
+ ctc_decoder:
65
+ name: RCTCDecoder
66
+
67
+ Loss:
68
+ name: CTCLoss
69
+
70
+ PostProcess:
71
+ name: CTCLabelDecode
72
+ character_dict_path: *character_dict_path
73
+ use_space_char: *use_space_char
74
+
75
+ Metric:
76
+ name: RecMetric
77
+ main_indicator: acc
78
+ is_filter: True
79
+
80
+ Train:
81
+ dataset:
82
+ name: RatioDataSetTVResize
83
+ ds_width: True
84
+ padding: false
85
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
86
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
87
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
88
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
89
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
90
+ ]
91
+ transforms:
92
+ - DecodeImagePIL: # load image
93
+ img_mode: RGB
94
+ - PARSeqAugPIL:
95
+ - CTCLabelEncode: # Class handling label
96
+ character_dict_path: *character_dict_path
97
+ use_space_char: *use_space_char
98
+ max_text_length: *max_text_length
99
+ - KeepKeys:
100
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
101
+ sampler:
102
+ name: RatioSampler
103
+ scales: [[128, 32]] # w, h
104
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
105
+ first_bs: &bs 128
106
+ fix_bs: false
107
+ divided_factor: [4, 16] # w, h
108
+ is_training: True
109
+ loader:
110
+ shuffle: True
111
+ batch_size_per_card: *bs
112
+ drop_last: True
113
+ max_ratio: &max_ratio 12
114
+ num_workers: 4
115
+
116
+ Eval:
117
+ dataset:
118
+ name: RatioDataSetTVResize
119
+ ds_width: True
120
+ padding: False
121
+ data_dir_list: [
122
+ '../evaluation/CUTE80',
123
+ '../evaluation/IC13_857',
124
+ '../evaluation/IC15_1811',
125
+ '../evaluation/IIIT5k',
126
+ '../evaluation/SVT',
127
+ '../evaluation/SVTP',
128
+ ]
129
+ transforms:
130
+ - DecodeImagePIL: # load image
131
+ img_mode: RGB
132
+ - CTCLabelEncode: # Class handling label
133
+ character_dict_path: *character_dict_path
134
+ use_space_char: *use_space_char
135
+ max_text_length: *max_text_length
136
+ - KeepKeys:
137
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
138
+ sampler:
139
+ name: RatioSampler
140
+ scales: [[128, 32]] # w, h
141
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
142
+ first_bs: *bs
143
+ fix_bs: false
144
+ divided_factor: [4, 16] # w, h
145
+ is_training: False
146
+ loader:
147
+ shuffle: False
148
+ drop_last: False
149
+ batch_size_per_card: *bs
150
+ max_ratio: *max_ratio
151
+ num_workers: 4
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_smtr_long.yml ADDED
@@ -0,0 +1,150 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_nodetach_smtr_long_infer
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 1000]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/predicts_smtr.txt
22
+ use_amp: True
23
+ distributed: true
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.000325
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
34
+ cycle_momentum: False
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: BGPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: SVTRv2LNConvTwo33
42
+ use_pos_embed: False
43
+ out_channels: 256
44
+ dims: [128, 256, 384]
45
+ depths: [6, 6, 6]
46
+ num_heads: [4, 8, 12]
47
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
48
+ local_k: [[5, 5], [5, 5], [-1, -1]]
49
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
50
+ last_stage: false
51
+ feat2d: True
52
+ Decoder:
53
+ name: GTCDecoder
54
+ infer_gtc: True
55
+ detach: False
56
+ gtc_decoder:
57
+ name: SMTRDecoder
58
+ num_layer: 1
59
+ ds: True
60
+ max_len: *max_text_length
61
+ next_mode: &next True
62
+ sub_str_len: &subsl 5
63
+ infer_aug: True
64
+ ctc_decoder:
65
+ name: RCTCDecoder
66
+
67
+ Loss:
68
+ name: GTCLoss
69
+ ctc_weight: 0.1
70
+ gtc_loss:
71
+ name: SMTRLoss
72
+
73
+ PostProcess:
74
+ name: GTCLabelDecode
75
+ gtc_label_decode:
76
+ name: SMTRLabelDecode
77
+ next_mode: *next
78
+ character_dict_path: *character_dict_path
79
+ use_space_char: *use_space_char
80
+ only_gtc: True
81
+
82
+ Metric:
83
+ name: RecGTCMetric
84
+ main_indicator: acc
85
+ is_filter: True
86
+
87
+ Train:
88
+ dataset:
89
+ name: RatioDataSetTVResize
90
+ ds_width: True
91
+ padding: false
92
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
93
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
94
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
95
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
96
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
97
+ ]
98
+ transforms:
99
+ - DecodeImagePIL: # load image
100
+ img_mode: RGB
101
+ - PARSeqAugPIL:
102
+ - SMTRLabelEncode: # Class handling label
103
+ sub_str_len: *subsl
104
+ character_dict_path: *character_dict_path
105
+ use_space_char: *use_space_char
106
+ max_text_length: *max_text_length
107
+ - KeepKeys:
108
+ keep_keys: ['image', 'label', 'label_subs', 'label_next', 'length_subs',
109
+ 'label_subs_pre', 'label_next_pre', 'length_subs_pre', 'length'] # dataloader will return list in this order
110
+ sampler:
111
+ name: RatioSampler
112
+ scales: [[128, 32]] # w, h
113
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
114
+ first_bs: &bs 256
115
+ fix_bs: false
116
+ divided_factor: [4, 16] # w, h
117
+ is_training: True
118
+ loader:
119
+ shuffle: True
120
+ batch_size_per_card: *bs
121
+ drop_last: True
122
+ max_ratio: &max_ratio 12
123
+ num_workers: 4
124
+
125
+ Eval:
126
+ dataset:
127
+ name: SimpleDataSet
128
+ data_dir: ../ltb/
129
+ label_file_list: ['../ltb/ultra_long_70_list.txt']
130
+ transforms:
131
+ - DecodeImage: # load image
132
+ img_mode: BGR
133
+ channel_first: False
134
+ - GTCLabelEncode: # Class handling label
135
+ gtc_label_encode:
136
+ name: ARLabelEncode
137
+ character_dict_path: *character_dict_path
138
+ use_space_char: *use_space_char
139
+ max_text_length: 200
140
+ - SliceResize:
141
+ image_shape: [3, 32, 128]
142
+ padding: False
143
+ max_ratio: 12
144
+ - KeepKeys:
145
+ keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
146
+ loader:
147
+ shuffle: False
148
+ drop_last: False
149
+ batch_size_per_card: 1
150
+ num_workers: 2
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_stream.yml ADDED
@@ -0,0 +1,152 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 60
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_stream
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/predicts_smtr.txt
22
+ use_amp: True
23
+ distributed: true
24
+ grad_clip_val: 20
25
+
26
+ Optimizer:
27
+ name: AdamW
28
+ lr: 0.00065
29
+ weight_decay: 0.05
30
+ filter_bias_and_bn: True
31
+
32
+ LRScheduler:
33
+ name: OneCycleLR
34
+    warmup_epoch: 5 # pct_start 5/60 ≈ 0.083
35
+ cycle_momentum: False
36
+
37
+ Architecture:
38
+ model_type: rec
39
+ algorithm: BGPD
40
+ in_channels: 3
41
+ Transform:
42
+ Encoder:
43
+ name: SVTRv2LNConvTwo33
44
+ use_pos_embed: False
45
+ out_channels: 256
46
+ dims: [128, 256, 384]
47
+ depths: [6, 6, 6]
48
+ num_heads: [4, 8, 12]
49
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
50
+ local_k: [[5, 5], [5, 5], [-1, -1]]
51
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
52
+ last_stage: false
53
+ feat2d: True
54
+ Decoder:
55
+ name: GTCDecoder
56
+ infer_gtc: True
57
+ detach: False
58
+ gtc_decoder:
59
+ name: SMTRDecoder
60
+ num_layer: 1
61
+ ds: True
62
+ max_len: *max_text_length
63
+ next_mode: &next True
64
+ sub_str_len: &subsl 5
65
+ infer_aug: False
66
+ ctc_decoder:
67
+ name: RCTCDecoder
68
+
69
+ Loss:
70
+ name: GTCLoss
71
+ ctc_weight: 0.25
72
+ gtc_loss:
73
+ name: SMTRLoss
74
+
75
+ PostProcess:
76
+ name: GTCLabelDecode
77
+ gtc_label_decode:
78
+ name: SMTRLabelDecode
79
+ next_mode: *next
80
+ character_dict_path: *character_dict_path
81
+ use_space_char: *use_space_char
82
+ only_gtc: True
83
+
84
+ Metric:
85
+ name: RecMetric
86
+ main_indicator: acc
87
+ is_filter: True
88
+ stream: True
89
+
90
+ Train:
91
+ dataset:
92
+ name: RatioDataSetTVResize
93
+ ds_width: True
94
+ padding: false
95
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
96
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
97
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
98
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
99
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
100
+ ]
101
+ transforms:
102
+ - DecodeImagePIL: # load image
103
+ img_mode: RGB
104
+ - PARSeqAugPIL:
105
+ - SMTRLabelEncode: # Class handling label
106
+ sub_str_len: *subsl
107
+ character_dict_path: *character_dict_path
108
+ use_space_char: *use_space_char
109
+ max_text_length: *max_text_length
110
+ - KeepKeys:
111
+ keep_keys: ['image', 'label', 'label_subs', 'label_next', 'length_subs',
112
+ 'label_subs_pre', 'label_next_pre', 'length_subs_pre', 'length'] # dataloader will return list in this order
113
+ sampler:
114
+ name: RatioSampler
115
+ scales: [[128, 32]] # w, h
116
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
117
+ first_bs: &bs 256
118
+ fix_bs: false
119
+ divided_factor: [4, 16] # w, h
120
+ is_training: True
121
+ loader:
122
+ shuffle: True
123
+ batch_size_per_card: *bs
124
+ drop_last: True
125
+ max_ratio: &max_ratio 12
126
+ num_workers: 4
127
+
128
+ Eval:
129
+ dataset:
130
+ name: SimpleDataSet
131
+ data_dir: ../ltb/
132
+ label_file_list: ['../ltb/ultra_long_70_list.txt']
133
+ transforms:
134
+ - DecodeImagePIL: # load image
135
+ img_mode: RGB
136
+ - GTCLabelEncode: # Class handling label
137
+ gtc_label_encode:
138
+ name: ARLabelEncode
139
+ character_dict_path: *character_dict_path
140
+ use_space_char: *use_space_char
141
+ max_text_length: *max_text_length
142
+ - SliceTVResize:
143
+ image_shape: [32, 128]
144
+ padding: False
145
+ max_ratio: 4
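+          # Assumption: SliceTVResize cuts an ultra-long text image into slices of at most max_ratio * 32
+          # width at height 32; the slices are recognized one by one and merged by the metric
+          # (Metric.stream: True), which is why batch_size_per_card is 1 here.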
146
+ - KeepKeys:
147
+ keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
148
+ loader:
149
+ shuffle: False
150
+ drop_last: False
151
+ batch_size_per_card: 1
152
+ num_workers: 2
configs/rec/igtr/readme.md ADDED
@@ -0,0 +1,192 @@
1
+ # IGTR
2
+
3
+ - [IGTR](#igtr)
4
+ - [1. Introduction](#1-introduction)
5
+ - [2. Environment](#2-environment)
6
+ - [Dataset Preparation](#dataset-preparation)
7
+ - [3. Model Training / Evaluation](#3-model-training--evaluation)
8
+ - [Citation](#citation)
9
+
10
+ <a name="1"></a>
11
+
12
+ ## 1. Introduction
13
+
14
+ Paper:
15
+
16
+ > [Instruction-Guided Scene Text Recognition](https://arxiv.org/abs/2401.17851),
17
+ > Yongkun Du, Zhineng Chen, Yuchen Su, Caiyan Jia, Yu-Gang Jiang,
18
+ > TPAMI
19
+
20
+ <a name="model"></a>
21
+ Multi-modal models have shown appealing performance in visual recognition tasks, as free-form text-guided training evokes the ability to understand fine-grained visual content. However, current models cannot be trivially applied to scene text recognition (STR) due to the compositional difference between natural and text images. We propose a novel instruction-guided scene text recognition (IGTR) paradigm that formulates STR as an instruction learning problem and understands text images by predicting character attributes, e.g., character frequency, position, etc. IGTR first devises $\\left \\langle condition,question,answer\\right \\rangle$ instruction triplets, providing rich and diverse descriptions of character attributes. To effectively learn these attributes through question-answering, IGTR develops a lightweight instruction encoder, a cross-modal feature fusion module and a multi-task answer head, which guides nuanced text image understanding. Furthermore, IGTR realizes different recognition pipelines simply by using different instructions, enabling a character-understanding-based text reasoning paradigm that differs from current methods considerably. Experiments on English and Chinese benchmarks show that IGTR outperforms existing models by significant margins, while maintaining a small model size and fast inference speed. Moreover, by adjusting the sampling of instructions, IGTR offers an elegant way to tackle the recognition of rarely appearing and morphologically similar characters, which were previous challenges.
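+
+ To make the instruction-triplet idea concrete, here is a minimal, purely illustrative sketch of what a ⟨condition, question, answer⟩ instruction about character attributes might look like (the field names below are invented for illustration; the actual encoding is implemented by `IGTRLabelEncode` in the training configs):
+
+ ```python
+ # Hypothetical instruction triplets for a word image whose label is "hello".
+ # Each triplet pairs a condition and a question about character attributes with its answer.
+ instructions = [
+     {"condition": "the image contains the character 'l'",
+      "question": "how many times does 'l' occur?",
+      "answer": 2},                      # character-frequency attribute
+     {"condition": "the first character is 'h'",
+      "question": "which character is at position 3?",
+      "answer": "l"},                    # character-position attribute
+ ]
+ ```
+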
22
+
23
+ <a name="model"></a>
24
+ The accuracy (%) and model files of IGTR on public scene text recognition datasets are as follows:
25
+
26
+ - Trained on the synthetic datasets (MJ+ST) and tested on the common benchmarks; the training and test datasets are both from [PARSeq](https://github.com/baudm/parseq).
27
+
28
+ | Model | IC13<br/>857 | SVT | IIIT5k<br/>3000 | IC15<br/>1811 | SVTP | CUTE80 | Avg | Config&Model&Log |
29
+ | :-----: | :----------: | :--: | :-------------: | :-----------: | :--: | :----: | :---: | :---------------------------------------------------------------------------------------------: |
30
+ | IGTR-PD | 97.6 | 95.2 | 97.6 | 88.4 | 91.6 | 95.5 | 94.30 | [link](https://drive.google.com/drive/folders/1Pv0CW2hiWC_dIyaB74W1fsXqiX3z5yXA?usp=drive_link) |
31
+ | IGTR-AR | 98.6 | 95.7 | 98.2 | 88.4 | 92.4 | 95.5 | 94.78 | as above |
32
+
33
+ - Tested on the Union14M-L benchmark, from [Union14M](https://github.com/Mountchicken/Union14M/).
34
+
35
+ | Model | Curve | Multi-<br/>Oriented | Artistic | Contextless | Salient | Multi-<br/>word | General | Avg | Config&Model&Log |
36
+ | :-----: | :---: | :-----------------: | :------: | :---------: | :-----: | :-------------: | :-----: | :---: | :---------------------: |
37
+ | IGTR-PD | 76.9 | 30.6 | 59.1 | 63.3 | 77.8 | 62.5 | 66.7 | 62.40 | Same as the above table |
38
+ | IGTR-AR | 78.4 | 31.9 | 61.3 | 66.5 | 80.2 | 69.3 | 67.9 | 65.07 | as above |
39
+
40
+ - Trained on the Union14M-L training dataset.
41
+
42
+ | Model | IC13<br/>857 | SVT | IIIT5k<br/>3000 | IC15<br/>1811 | SVTP | CUTE80 | Avg | Config&Model&Log |
43
+ | :----------: | :----------: | :--: | :-------------: | :-----------: | :--: | :----: | :---: | :---------------------------------------------------------------------------------------------: |
44
+ | IGTR-PD | 97.7 | 97.7 | 98.3 | 89.8 | 93.7 | 97.9 | 95.86 | [link](https://drive.google.com/drive/folders/1ZGlzDqEzjrBg8qG2wBkbOm3bLRzFbTzo?usp=drive_link) |
45
+ | IGTR-AR | 98.1 | 98.4 | 98.7 | 90.5 | 94.9 | 98.3 | 96.48 | as above |
46
+ | IGTR-PD-60ep | 97.9 | 98.3 | 99.2 | 90.8 | 93.7 | 97.6 | 96.24 | [link](https://drive.google.com/drive/folders/1ik4hxZDRsjU1RbCA19nwE45Kg1bCnMoa?usp=drive_link) |
47
+ | IGTR-AR-60ep | 98.4 | 98.1 | 99.3 | 91.5 | 94.3 | 97.6 | 96.54 | as above |
48
+ | IGTR-PD-PT | 98.6 | 98.0 | 99.1 | 91.7 | 96.8 | 99.0 | 97.20 | [link](https://drive.google.com/drive/folders/1QM0EWV66IfYI1G0Xm066V2zJA62hH6-1?usp=drive_link) |
49
+ | IGTR-AR-PT | 98.8 | 98.3 | 99.2 | 92.0 | 96.8 | 99.0 | 97.34 | as above |
50
+
51
+ | Model | Curve | Multi-<br/>Oriented | Artistic | Contextless | Salient | Multi-<br/>word | General | Avg | Config&Model&Log |
52
+ | :----------: | :---: | :-----------------: | :------: | :---------: | :-----: | :-------------: | :-----: | :---: | :---------------------: |
53
+ | IGTR-PD | 88.1 | 89.9 | 74.2 | 80.3 | 82.8 | 79.2 | 83.0 | 82.51 | Same as the above table |
54
+ | IGTR-AR | 90.4 | 91.2 | 77.0 | 82.4 | 84.7 | 84.0 | 84.4 | 84.86 | as above |
55
+ | IGTR-PD-60ep | 90.0 | 92.1 | 77.5 | 82.8 | 86.0 | 83.0 | 84.8 | 85.18 | Same as the above table |
56
+ | IGTR-AR-60ep | 91.0 | 93.0 | 78.7 | 84.6 | 87.3 | 84.8 | 85.6 | 86.43 | as above |
57
+ | IGTR-PD-PT | 92.4 | 92.1 | 80.7 | 83.6 | 87.7 | 86.9 | 85.0 | 86.92 | Same as the above table |
58
+ | IGTR-AR-PT | 93.0 | 92.9 | 81.3 | 83.4 | 88.6 | 88.7 | 85.6 | 87.65 | as above |
59
+
60
+ - Trained and tested on the Chinese datasets, from the [Chinese Benchmark](https://github.com/FudanVI/benchmarking-chinese-text-recognition).
61
+
62
+ | Model | Scene | Web | Document | Handwriting | Avg | Config&Model&Log |
63
+ | :---------: | :---: | :--: | :------: | :---------: | :---: | :---------------------------------------------------------------------------------------------: |
64
+ | IGTR-PD | 73.1 | 74.8 | 98.6 | 52.5 | 74.75 | |
65
+ | IGTR-AR | 75.1 | 76.4 | 98.7 | 55.3 | 76.37 | |
66
+ | IGTR-PD-TS | 73.5 | 75.9 | 98.7 | 54.5 | 75.65 | [link](https://drive.google.com/drive/folders/1H3VRdGHjhawd6fkSC-qlBzVzvYYTpHRg?usp=drive_link) |
67
+ | IGTR-AR-TS | 75.6 | 77.0 | 98.8 | 57.3 | 77.17 | as above |
68
+ | IGTR-PD-Aug | 79.5 | 80.0 | 99.4 | 58.9 | 79.45 | [link](https://drive.google.com/drive/folders/1XFQkCILwcFwA7iYyQY9crnrouaI5sqcZ?usp=drive_link) |
69
+ | IGTR-AR-Aug | 82.0 | 81.7 | 99.5 | 63.8 | 81.74 | as above |
70
+
71
+ Download all Configs, Models, and Logs from [Google Drive](https://drive.google.com/drive/folders/1mSRDg9Mj5R6PspAdFGXZHDHTCQmjkd8d?usp=drive_link).
72
+
73
+ <a name="2"></a>
74
+
75
+ ## 2. Environment
76
+
77
+ - [PyTorch](http://pytorch.org/) version >= 1.13.0
78
+ - Python version >= 3.7
79
+
80
+ ```shell
81
+ git clone -b develop https://github.com/Topdu/OpenOCR.git
82
+ cd OpenOCR
83
+ # A100 Ubuntu 20.04 Cuda 11.8
84
+ conda create -n openocr python==3.8
85
+ conda activate openocr
86
+ conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=11.8 -c pytorch -c nvidia
87
+ pip install -r requirements.txt
88
+ ```
89
+
90
+ #### Dataset Preparation
91
+
92
+ - [English dataset download](https://github.com/baudm/parseq)
93
+
94
+ - [Union14M-L-LMDB-Filtered download](https://drive.google.com/drive/folders/1OlDWJZgvd6s4S09S3IGeAI90jI0i7AB_?usp=sharing)
95
+
96
+ - [Chinese dataset download](https://github.com/fudanvi/benchmarking-chinese-text-recognition#download)
97
+
98
+ The expected filesystem structure is as follows:
99
+
100
+ ```
101
+ benchmark_bctr
102
+ ├── benchmark_bctr_test
103
+ │ ├── document_test
104
+ │ ├── handwriting_test
105
+ │ ├── scene_test
106
+ │ └── web_test
107
+ └── benchmark_bctr_train
108
+ ├── document_train
109
+ ├── handwriting_train
110
+ ├── scene_train
111
+ └── web_train
112
+ evaluation
113
+ ├── CUTE80
114
+ ├── IC13_857
115
+ ├── IC15_1811
116
+ ├── IIIT5k
117
+ ├── SVT
118
+ └── SVTP
119
+ OpenOCR
120
+ synth
121
+ ├── MJ
122
+ │ ├── test
123
+ │ ├── train
124
+ │ └── val
125
+ └── ST
126
+ test # from PARSeq
127
+ ├── ArT
128
+ ├── COCOv1.4
129
+ ├── CUTE80
130
+ ├── IC13_1015
131
+ ├── IC13_1095
132
+ ├── IC13_857
133
+ ├── IC15_1811
134
+ ├── IC15_2077
135
+ ├── IIIT5k
136
+ ├── SVT
137
+ ├── SVTP
138
+ └── Uber
139
+ u14m # lmdb format
140
+ ├── artistic
141
+ ├── contextless
142
+ ├── curve
143
+ ├── general
144
+ ├── multi_oriented
145
+ ├── multi_words
146
+ └── salient
147
+ Union14M-L-LMDB-Filtered # lmdb format
148
+ ├── train_challenging
149
+ ├── train_easy
150
+ ├── train_hard
151
+ ├── train_medium
152
+ └── train_normal
153
+ ```
154
+
155
+ <a name="3"></a>
156
+
157
+ ## 3. Model Training / Evaluation
158
+
159
+ Training:
160
+
161
+ ```shell
162
+ # The configuration file is available from the link provided in the table above.
163
+ # Multi GPU training
164
+ CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 tools/train_rec.py --c PATH/svtr_base_igtr_XXX.yml
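+ # --nproc_per_node should match the number of GPUs listed in CUDA_VISIBLE_DEVICES (2 in this example).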
165
+ ```
166
+
167
+ Evaluation:
168
+
169
+ ```shell
170
+ # The configuration file is available from the link provided in the table above.
171
+ # en
172
+ python tools/eval_rec_all_en.py --c PATH/svtr_base_igtr_syn.yml
173
+ # ch
174
+ python tools/eval_rec_all_ch.py --c PATH/svtr_base_igtr_ch_aug.yml
175
+ ```
176
+
177
+ ## Citation
178
+
179
+ If you find our method useful for your research, please cite:
180
+
181
+ ```bibtex
182
+ @article{Du2024IGTR,
183
+ title = {Instruction-Guided Scene Text Recognition},
184
+ author = {Du, Yongkun and Chen, Zhineng and Su, Yuchen and Jia, Caiyan and Jiang, Yu-Gang},
185
+ journal = {CoRR},
186
+ eprinttype = {arXiv},
187
+ primaryClass={cs.CV},
188
+ volume = {abs/2401.17851},
189
+ year = {2024},
190
+ url = {https://arxiv.org/abs/2401.17851}
191
+ }
192
+ ```
configs/rec/igtr/svtr_base_ds_igtr.yml ADDED
@@ -0,0 +1,157 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtr_base_igtr
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path
18
+ # ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_igtr.txt
23
+ use_amp: True
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.0005 # 2gpus 384bs/gpu
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1.5
34
+ cycle_momentum: False
35
+
36
+ Architecture:
37
+ model_type: rec
38
+ algorithm: IGTR
39
+ in_channels: 3
40
+ Transform:
41
+ Encoder:
42
+ name: SVTRNet2DPos
43
+ img_size: [32, -1]
44
+ out_char_num: 25
45
+ out_channels: 256
46
+ patch_merging: 'Conv'
47
+ embed_dim: [128, 256, 384]
48
+ depth: [6, 6, 6]
49
+ num_heads: [4, 8, 12]
50
+ mixer: ['ConvB','ConvB','ConvB','ConvB','ConvB','ConvB', 'ConvB','ConvB', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
51
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
52
+ last_stage: False
53
+ prenorm: True
54
+ use_first_sub: False
55
+ Decoder:
56
+ name: IGTRDecoder
57
+ dim: 384
58
+ num_layer: 1
59
+ ar: False
60
+ refine_iter: 0
61
+ # next_pred: True
62
+ next_pred: False
63
+ pos2d: True
64
+ ds: True
65
+ # pos_len: False
66
+ # rec_layer: 1
67
+
68
+
69
+ Loss:
70
+ name: IGTRLoss
71
+
72
+ PostProcess:
73
+ name: IGTRLabelDecode
74
+ character_dict_path: *character_dict_path
75
+ use_space_char: *use_space_char
76
+
77
+ Metric:
78
+ name: RecMetric
79
+ main_indicator: acc
80
+
81
+ Train:
82
+ dataset:
83
+ name: RatioDataSet
84
+ ds_width: True
85
+ padding: &padding False
86
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
87
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
88
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
89
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
90
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
91
+ ]
92
+ transforms:
93
+ - DecodeImage: # load image
94
+ img_mode: BGR
95
+ channel_first: False
96
+ - PARSeqAug:
97
+ - IGTRLabelEncode: # Class handling label
98
+ k: 8
99
+ prompt_error: False
100
+ character_dict_path: *character_dict_path
101
+ use_space_char: *use_space_char
102
+ max_text_length: *max_text_length
103
+ - KeepKeys:
104
+ keep_keys: ['image', 'label', 'prompt_pos_idx_list',
105
+ 'prompt_char_idx_list', 'ques_pos_idx_list', 'ques1_answer_list',
106
+ 'ques2_char_idx_list', 'ques2_answer_list', 'ques3_answer', 'ques4_char_num_list',
107
+ 'ques_len_list', 'ques2_len_list', 'prompt_len_list', 'length'] # dataloader will return list in this order
108
+ sampler:
109
+ name: RatioSampler
110
+ scales: [[128, 32]] # w, h
111
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
112
+ first_bs: &bs 384
113
+ fix_bs: false
114
+ divided_factor: [4, 16] # w, h
115
+ is_training: True
116
+ loader:
117
+ shuffle: True
118
+ batch_size_per_card: *bs
119
+ drop_last: True
120
+ max_ratio: &max_ratio 4
121
+ num_workers: 4
122
+
123
+ Eval:
124
+ dataset:
125
+ name: RatioDataSet
126
+ ds_width: True
127
+ padding: *padding
128
+ data_dir_list: ['../evaluation/CUTE80',
129
+ '../evaluation/IC13_857',
130
+ '../evaluation/IC15_1811',
131
+ '../evaluation/IIIT5k',
132
+ '../evaluation/SVT',
133
+ '../evaluation/SVTP']
134
+ transforms:
135
+ - DecodeImage: # load image
136
+ img_mode: BGR
137
+ channel_first: False
138
+ - ARLabelEncode: # Class handling label
139
+ character_dict_path: *character_dict_path
140
+ use_space_char: *use_space_char
141
+ max_text_length: *max_text_length
142
+ - KeepKeys:
143
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
144
+ sampler:
145
+ name: RatioSampler
146
+ scales: [[128, 32]] # w, h
147
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
148
+ first_bs: 256
149
+ fix_bs: false
150
+ divided_factor: [4, 16] # w, h
151
+ is_training: False
152
+ loader:
153
+ shuffle: False
154
+ drop_last: False
155
+ batch_size_per_card: 256
156
+ max_ratio: *max_ratio
157
+ num_workers: 4
configs/rec/lister/focalsvtr_lister_wo_fem_maxratio12.yml ADDED
@@ -0,0 +1,133 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/focalsvtr_lister_wo_fem_maxratio12/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: &use_space_char False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_focalsvtr_lister_wo_fem_maxratio12.txt
19
+ use_amp: True
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: LISTER
36
+ Transform:
37
+ Encoder:
38
+ name: FocalSVTR
39
+ img_size: [32, 128]
40
+ depths: [6, 6, 9]
41
+ embed_dim: 96
42
+ sub_k: [[1, 1], [2, 1], [1, 1]]
43
+ focal_levels: [3, 3, 3]
44
+ last_stage: False
45
+ feat2d: True
46
+ Decoder:
47
+ name: LISTERDecoder
48
+ detach_grad: False
49
+ attn_scaling: True
50
+ use_fem: False
51
+
52
+ Loss:
53
+ name: LISTERLoss
54
+
55
+ PostProcess:
56
+ name: LISTERLabelDecode
57
+
58
+ Metric:
59
+ name: RecMetric
60
+ main_indicator: acc
61
+ is_filter: True
62
+
63
+ Train:
64
+ dataset:
65
+ name: RatioDataSetTVResize
66
+ ds_width: True
67
+ padding: False
68
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_filter_train_challenging',
69
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_hard',
70
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_medium',
71
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_normal',
72
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_easy',
73
+ ]
74
+ transforms:
75
+ - DecodeImagePIL: # load image
76
+ img_mode: RGB
77
+ - PARSeqAugPIL:
78
+ - EPLabelEncode: # Class handling label
79
+ character_dict_path: *character_dict_path
80
+ use_space_char: *use_space_char
81
+ max_text_length: *max_text_length
82
+ - KeepKeys:
83
+ keep_keys: ['image', 'label', 'length']
84
+ sampler:
85
+ name: RatioSampler
86
+ scales: [[128, 32]] # w, h
87
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
88
+ first_bs: &bs 256
89
+ fix_bs: false
90
+ divided_factor: [4, 16] # w, h
91
+ is_training: True
92
+ loader:
93
+ shuffle: True
94
+ batch_size_per_card: *bs
95
+ drop_last: True
96
+ max_ratio: 12
97
+ num_workers: 4
98
+
99
+ Eval:
100
+ dataset:
101
+ name: RatioDataSetTVResize
102
+ ds_width: True
103
+ padding: False
104
+ data_dir_list: ['../evaluation/CUTE80',
105
+ '../evaluation/IC13_857',
106
+ '../evaluation/IC15_1811',
107
+ '../evaluation/IIIT5k',
108
+ '../evaluation/SVT',
109
+ '../evaluation/SVTP',
110
+ ]
111
+ transforms:
112
+ - DecodeImagePIL: # load image
113
+ img_mode: RGB
114
+ - EPLabelEncode: # Class handling label
115
+ character_dict_path: *character_dict_path
116
+ use_space_char: *use_space_char
117
+ max_text_length: *max_text_length
118
+ - KeepKeys:
119
+ keep_keys: ['image', 'label', 'length']
120
+ sampler:
121
+ name: RatioSampler
122
+ scales: [[128, 32]] # w, h
123
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
124
+ first_bs: 256
125
+ fix_bs: false
126
+ divided_factor: [4, 16] # w, h
127
+ is_training: False
128
+ loader:
129
+ shuffle: False
130
+ drop_last: False
131
+ batch_size_per_card: *bs
132
+ max_ratio: 12
133
+ num_workers: 4
configs/rec/lister/svtrv2_lister_wo_fem_maxratio12.yml ADDED
@@ -0,0 +1,138 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_lister_wo_fem_maxratio12/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: &use_space_char False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_lister_wo_fem_maxratio12.txt
19
+ use_amp: True
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.000325
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+
31
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
32
+ cycle_momentum: False
33
+
34
+ Architecture:
35
+ model_type: rec
36
+ algorithm: LISTER
37
+ Transform:
38
+ Encoder:
39
+ name: SVTRv2LNConvTwo33
40
+ use_pos_embed: False
41
+ out_channels: 256
42
+ dims: [128, 256, 384]
43
+ depths: [6, 6, 6]
44
+ num_heads: [4, 8, 12]
45
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
46
+ local_k: [[5, 5], [5, 5], [-1, -1]]
47
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
48
+ last_stage: false
49
+ feat2d: True
50
+ Decoder:
51
+ name: LISTERDecoder
52
+ detach_grad: False
53
+ attn_scaling: True
54
+ use_fem: False
55
+
56
+ Loss:
57
+ name: LISTERLoss
58
+
59
+ PostProcess:
60
+ name: LISTERLabelDecode
61
+
62
+ Metric:
63
+ name: RecMetric
64
+ main_indicator: acc
65
+ is_filter: True
66
+
67
+ Train:
68
+ dataset:
69
+ name: RatioDataSetTVResize
70
+ ds_width: True
71
+ padding: False
72
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
73
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
75
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
76
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
77
+ ]
78
+ transforms:
79
+ - DecodeImagePIL: # load image
80
+ img_mode: RGB
81
+ - PARSeqAugPIL:
82
+ - EPLabelEncode: # Class handling label
83
+ character_dict_path: *character_dict_path
84
+ use_space_char: *use_space_char
85
+ max_text_length: *max_text_length
86
+ - KeepKeys:
87
+ keep_keys: ['image', 'label', 'length']
88
+ sampler:
89
+ name: RatioSampler
90
+ scales: [[128, 32]] # w, h
91
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
92
+ first_bs: &bs 128
93
+ fix_bs: false
94
+ divided_factor: [4, 16] # w, h
95
+ is_training: True
96
+ loader:
97
+ shuffle: True
98
+ batch_size_per_card: *bs
99
+ drop_last: True
100
+ max_ratio: 12
101
+ num_workers: 4
102
+
103
+ Eval:
104
+ dataset:
105
+ name: RatioDataSetTVResize
106
+ ds_width: True
107
+ padding: False
108
+ data_dir_list: ['../evaluation/CUTE80',
109
+ '../evaluation/IC13_857',
110
+ '../evaluation/IC15_1811',
111
+ '../evaluation/IIIT5k',
112
+ '../evaluation/SVT',
113
+ '../evaluation/SVTP',
114
+ ]
115
+ transforms:
116
+ - DecodeImagePIL: # load image
117
+ img_mode: RGB
118
+ - EPLabelEncode: # Class handling label
119
+ character_dict_path: *character_dict_path
120
+ use_space_char: *use_space_char
121
+ max_text_length: *max_text_length
122
+
123
+ - KeepKeys:
124
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
125
+ sampler:
126
+ name: RatioSampler
127
+ scales: [[128, 32]] # w, h
128
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
129
+ first_bs: 256
130
+ fix_bs: false
131
+ divided_factor: [4, 16] # w, h
132
+ is_training: False
133
+ loader:
134
+ shuffle: False
135
+ drop_last: False
136
+ batch_size_per_card: *bs
137
+ max_ratio: 12
138
+ num_workers: 4
configs/rec/lpv/svtr_base_lpv.yml ADDED
@@ -0,0 +1,124 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtr_base_lpv/
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ # ./output/rec/u14m_filter/svtr_base_lpv_wo_glrm/best.pth
14
+ checkpoints:
15
+ use_tensorboard: false
16
+ infer_img:
17
+ # for data or label process
18
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/u14m_filter/predicts_svtr_lpv.txt
23
+ use_amp: True
24
+ grad_clip_val: 20
25
+
26
+ Optimizer:
27
+ name: Adam
28
+ lr: 0.0001 # for 4gpus bs128/gpu
29
+ weight_decay: 0.0
30
+ filter_bias_and_bn: False
31
+ betas: [0.9, 0.99]
32
+
33
+ LRScheduler:
34
+ name: MultiStepLR
35
+ milestones: [12]
36
+ gamma: 0.1
37
+
38
+ Architecture:
39
+ model_type: rec
40
+ algorithm: LPV
41
+ in_channels: 3
42
+ Transform:
43
+ Encoder:
44
+ name: SVTRNet
45
+ img_size: [32, 128]
46
+ out_char_num: 25
47
+ out_channels: 256
48
+ patch_merging: 'Conv'
49
+ embed_dim: [128, 256, 384]
50
+ depth: [6, 6, 6]
51
+ num_heads: [4, 8, 12]
52
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
53
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
54
+ sub_k: [[1, 1], [1, 1]]
55
+ feature2d: True
56
+ last_stage: False
57
+ prenorm: True
58
+ Decoder:
59
+ name: LPVDecoder
60
+ num_layer: 3
61
+ max_len: *max_text_length
62
+ use_mask: True
63
+ dim_feedforward: 1536
64
+ nhead: 12
65
+ dropout: 0.1
66
+ trans_layer: 3
67
+
68
+ Loss:
69
+ name: LPVLoss
70
+
71
+ PostProcess:
72
+ name: ARLabelDecode
73
+ character_dict_path: *character_dict_path
74
+ use_space_char: *use_space_char
75
+
76
+ Metric:
77
+ name: RecMetric
78
+ main_indicator: acc
79
+ is_filter: True
80
+
81
+ Train:
82
+ dataset:
83
+ name: LMDBDataSet
84
+ data_dir: ../Union14M-L-LMDB-Filtered
85
+ transforms:
86
+ - DecodeImagePIL: # load image
87
+ img_mode: RGB
88
+ - PARSeqAugPIL:
89
+ - ARLabelEncode: # Class handling label
90
+ character_dict_path: *character_dict_path
91
+ use_space_char: *use_space_char
92
+ max_text_length: *max_text_length
93
+ - RecTVResize:
94
+ image_shape: [32, 128]
95
+ padding: False
96
+ - KeepKeys:
97
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
98
+ loader:
99
+ shuffle: True
100
+ batch_size_per_card: 128
101
+ drop_last: True
102
+ num_workers: 4
103
+
104
+ Eval:
105
+ dataset:
106
+ name: LMDBDataSet
107
+ data_dir: ../evaluation/
108
+ transforms:
109
+ - DecodeImagePIL: # load image
110
+ img_mode: RGB
111
+ - ARLabelEncode: # Class handling label
112
+ character_dict_path: *character_dict_path
113
+ use_space_char: *use_space_char
114
+ max_text_length: *max_text_length
115
+ - RecTVResize:
116
+ image_shape: [32, 128]
117
+ padding: False
118
+ - KeepKeys:
119
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
120
+ loader:
121
+ shuffle: False
122
+ drop_last: False
123
+ batch_size_per_card: 128
124
+ num_workers: 4
configs/rec/lpv/svtr_base_lpv_wo_glrm.yml ADDED
@@ -0,0 +1,123 @@
+ Global:
+ device: gpu
+ epoch_num: 20
+ log_smooth_window: 20
+ print_batch_step: 10
+ output_dir: ./output/rec/u14m_filter/svtr_base_lpv_wo_glrm/
+ save_epoch_step: [15, 1]
+ # evaluation is run every 500 iterations
+ eval_batch_step: [0, 500]
+ eval_epoch_step: [0, 1]
+ cal_metric_during_train: True
+ pretrained_model:
+ checkpoints:
+ use_tensorboard: false
+ infer_img:
+ # for data or label process
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
+ # ./tools/utils/ppocr_keys_v1.txt # ch
+ max_text_length: &max_text_length 25
+ use_space_char: &use_space_char False
+ save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_lpv_wo_glrm.txt
+ use_amp: True
+ grad_clip_val: 20
+
+ Optimizer:
+ name: Adam
+ lr: 0.0001 # for 4gpus bs128/gpu
+ weight_decay: 0.0
+ filter_bias_and_bn: False
+ betas: [0.9, 0.99]
+
+ LRScheduler:
+ name: MultiStepLR
+ milestones: [12]
+ gamma: 0.1
+
+ Architecture:
+ model_type: rec
+ algorithm: LPV
+ in_channels: 3
+ Transform:
+ Encoder:
+ name: SVTRNet
+ img_size: [32, 128]
+ out_char_num: 25
+ out_channels: 256
+ patch_merging: 'Conv'
+ embed_dim: [128, 256, 384]
+ depth: [6, 6, 6]
+ num_heads: [4, 8, 12]
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
+ sub_k: [[1, 1], [1, 1]]
+ feature2d: True
+ last_stage: False
+ prenorm: True
+ Decoder:
+ name: LPVDecoder
+ num_layer: 3
+ max_len: *max_text_length
+ use_mask: False
+ dim_feedforward: 1536
+ nhead: 12
+ dropout: 0.1
+ trans_layer: 3
+
+ Loss:
+ name: LPVLoss
+
+ PostProcess:
+ name: ARLabelDecode
+ character_dict_path: *character_dict_path
+ use_space_char: *use_space_char
+
+ Metric:
+ name: RecMetric
+ main_indicator: acc
+ is_filter: True
+
+ Train:
+ dataset:
+ name: LMDBDataSet
+ data_dir: ../Union14M-L-LMDB-Filtered
+ transforms:
+ - DecodeImagePIL: # load image
+ img_mode: RGB
+ - PARSeqAugPIL:
+ - ARLabelEncode: # Class handling label
+ character_dict_path: *character_dict_path
+ use_space_char: *use_space_char
+ max_text_length: *max_text_length
+ - RecTVResize:
+ image_shape: [32, 128]
+ padding: False
+ - KeepKeys:
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
+ loader:
+ shuffle: True
+ batch_size_per_card: 128
+ drop_last: True
+ num_workers: 4
+
+ Eval:
+ dataset:
+ name: LMDBDataSet
+ data_dir: ../evaluation/
+ transforms:
+ - DecodeImagePIL: # load image
+ img_mode: RGB
+ - ARLabelEncode: # Class handling label
+ character_dict_path: *character_dict_path
+ use_space_char: *use_space_char
+ max_text_length: *max_text_length
+ - RecTVResize:
+ image_shape: [32, 128]
+ padding: False
+ - KeepKeys:
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
+ loader:
+ shuffle: False
+ drop_last: False
+ batch_size_per_card: 128
+ num_workers: 4
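The decoder in this _wo_glrm variant matches svtr_base_lpv.yml except that use_mask is set to False, and the optimizer block pairs Adam (lr 1e-4 for 4 GPUs at batch 128 each) with a MultiStepLR schedule that decays once at epoch 12 of 20. A plain-Python sketch of that schedule, with the values copied from the Optimizer/LRScheduler blocks above:

# Sketch of the MultiStepLR schedule defined above (values from the config).
base_lr, gamma, milestones, total_epochs = 1e-4, 0.1, [12], 20

def lr_at(epoch):
    # MultiStepLR multiplies the base LR by gamma once per milestone passed.
    passed = sum(1 for m in milestones if epoch >= m)
    return base_lr * gamma ** passed

for e in (0, 11, 12, 19):
    print(e, lr_at(e))  # 1e-4 up to epoch 11, then 1e-5 from epoch 12 onward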
configs/rec/lpv/svtrv2_lpv.yml ADDED
@@ -0,0 +1,147 @@
+ Global:
+ device: gpu
+ epoch_num: 20
+ log_smooth_window: 20
+ print_batch_step: 10
+ output_dir: ./output/rec/u14m_filter/svtrv2_lpv/
+ save_epoch_step: [15, 1]
+ # evaluation is run every 500 iterations
+ eval_batch_step: [0, 500]
+ eval_epoch_step: [0, 1]
+ cal_metric_during_train: True
+ pretrained_model:
+ # ./output/rec/u14m_filter/svtrv2_lpv_wo_glrm/best.pth
+ checkpoints:
+ use_tensorboard: false
+ infer_img:
+ # for data or label process
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
+ # ./tools/utils/ppocr_keys_v1.txt # ch
+ max_text_length: &max_text_length 25
+ use_space_char: &use_space_char False
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_lpv.txt
+ use_amp: True
+ grad_clip_val: 20
+
+ Optimizer:
+ name: AdamW
+ lr: 0.000325 # for 4gpus bs128/gpu
+ weight_decay: 0.05
+ filter_bias_and_bn: True
+
+ LRScheduler:
+ name: OneCycleLR
+ warmup_epoch: 1 # pct_start 0.075*20 = 1.5ep
+ cycle_momentum: False
+
+ Architecture:
+ model_type: rec
+ algorithm: LPV
+ in_channels: 3
+ Transform:
+ Encoder:
+ name: SVTRv2LNConvTwo33
+ use_pos_embed: False
+ dims: [128, 256, 384]
+ depths: [6, 6, 6]
+ num_heads: [4, 8, 12]
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
+ local_k: [[5, 5], [5, 5], [-1, -1]]
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
+ last_stage: false
+ feat2d: True
+ Decoder:
+ name: LPVDecoder
+ num_layer: 3
+ max_len: *max_text_length
+ use_mask: True
+ dim_feedforward: 1536
+ nhead: 12
+ dropout: 0.1
+ trans_layer: 3
+
+ Loss:
+ name: LPVLoss
+
+ PostProcess:
+ name: ARLabelDecode
+ character_dict_path: *character_dict_path
+ use_space_char: *use_space_char
+
+ Metric:
+ name: RecMetric
+ main_indicator: acc
+ is_filter: True
+
+ Train:
+ dataset:
+ name: RatioDataSetTVResize
+ ds_width: True
+ padding: false
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
+ ]
+ transforms:
+ - DecodeImagePIL: # load image
+ img_mode: RGB
+ - PARSeqAugPIL:
+ - ARLabelEncode: # Class handling label
+ character_dict_path: *character_dict_path
+ use_space_char: *use_space_char
+ max_text_length: *max_text_length
+ - KeepKeys:
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
+ sampler:
+ name: RatioSampler
+ scales: [[128, 32]] # w, h
+ # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
+ first_bs: &bs 128
+ fix_bs: false
+ divided_factor: [4, 16] # w, h
+ is_training: True
+ loader:
+ shuffle: True
+ batch_size_per_card: *bs
+ drop_last: True
+ max_ratio: &max_ratio 4
+ num_workers: 4
+
+ Eval:
+ dataset:
+ name: RatioDataSetTVResize
+ ds_width: True
+ padding: False
+ data_dir_list: [
+ '../evaluation/CUTE80',
+ '../evaluation/IC13_857',
+ '../evaluation/IC15_1811',
+ '../evaluation/IIIT5k',
+ '../evaluation/SVT',
+ '../evaluation/SVTP',
+ ]
+ transforms:
+ - DecodeImagePIL: # load image
+ img_mode: RGB
+ - ARLabelEncode: # Class handling label
+ character_dict_path: *character_dict_path
+ use_space_char: *use_space_char
+ max_text_length: *max_text_length
+ - KeepKeys:
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
+ sampler:
+ name: RatioSampler
+ scales: [[128, 32]] # w, h
+ # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
+ first_bs: *bs
+ fix_bs: false
+ divided_factor: [4, 16] # w, h
+ is_training: False
+ loader:
+ shuffle: False
+ drop_last: False
+ batch_size_per_card: *bs
+ max_ratio: *max_ratio
+ num_workers: 4
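svtrv2_lpv.yml replaces the fixed-size LMDB pipeline with RatioDataSetTVResize plus a RatioSampler: samples are grouped by aspect ratio (capped at max_ratio 4) so each batch shares one target width at height 32, and widths stay compatible with the divided_factor so the encoder's downsampling works out. With first_bs 128 per card, the lr comment (0.000325 for 4 GPUs at bs128/gpu) corresponds to a global batch of 512. The helper below only illustrates the bucketing idea; the function name and grouping logic are hypothetical, not the actual RatioSampler implementation.

# Hypothetical illustration of aspect-ratio bucketing (not the real RatioSampler):
# group image indices by rounded w/h ratio, clamp to max_ratio, and derive a
# shared target width per bucket that is divisible by the downsampling factor.
from collections import defaultdict

def bucket_by_ratio(sizes, max_ratio=4, height=32, divided_factor=4):
    buckets = defaultdict(list)
    for idx, (w, h) in enumerate(sizes):
        ratio = min(max(round(w / h), 1), max_ratio)
        target_w = ratio * height
        target_w -= target_w % divided_factor  # keep the width divisible
        buckets[(target_w, height)].append(idx)
    return dict(buckets)

# Images with similar aspect ratios land in the same bucket and are batched together.
print(bucket_by_ratio([(100, 32), (250, 60), (480, 40), (64, 32)]))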