topdu committed
Commit c898ed6 · 1 Parent(s): d5923fc
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. app.py +361 -0
  2. configs/dataset/rec/evaluation.yaml +41 -0
  3. configs/dataset/rec/ltb.yaml +9 -0
  4. configs/dataset/rec/mjsynth.yaml +11 -0
  5. configs/dataset/rec/openvino.yaml +25 -0
  6. configs/dataset/rec/ost.yaml +17 -0
  7. configs/dataset/rec/synthtext.yaml +7 -0
  8. configs/dataset/rec/test.yaml +77 -0
  9. configs/dataset/rec/textocr.yaml +13 -0
  10. configs/dataset/rec/textocr_horizontal.yaml +13 -0
  11. configs/dataset/rec/union14m_b.yaml +47 -0
  12. configs/dataset/rec/union14m_l_filtered.yaml +35 -0
  13. configs/det/dbnet/repvit_db.yml +171 -0
  14. configs/rec/abinet/resnet45_trans_abinet_lang.yml +94 -0
  15. configs/rec/abinet/resnet45_trans_abinet_wo_lang.yml +93 -0
  16. configs/rec/abinet/svtrv2_abinet_lang.yml +130 -0
  17. configs/rec/abinet/svtrv2_abinet_wo_lang.yml +128 -0
  18. configs/rec/aster/resnet31_lstm_aster_tps_on.yml +93 -0
  19. configs/rec/aster/svtrv2_aster.yml +127 -0
  20. configs/rec/aster/svtrv2_aster_tps_on.yml +102 -0
  21. configs/rec/autostr/autostr_lstm_aster_tps_on.yml +95 -0
  22. configs/rec/busnet/svtrv2_busnet.yml +135 -0
  23. configs/rec/busnet/svtrv2_busnet_pretraining.yml +134 -0
  24. configs/rec/busnet/vit_busnet.yml +104 -0
  25. configs/rec/busnet/vit_busnet_pretraining.yml +104 -0
  26. configs/rec/cam/convnextv2_cam_tps_on.yml +118 -0
  27. configs/rec/cam/convnextv2_tiny_cam_tps_on.yml +118 -0
  28. configs/rec/cam/svtrv2_cam_tps_on.yml +123 -0
  29. configs/rec/cdistnet/resnet45_trans_cdistnet.yml +93 -0
  30. configs/rec/cdistnet/svtrv2_cdistnet.yml +139 -0
  31. configs/rec/cppd/svtr_base_cppd.yml +123 -0
  32. configs/rec/cppd/svtr_base_cppd_ch.yml +126 -0
  33. configs/rec/cppd/svtr_base_cppd_h8.yml +123 -0
  34. configs/rec/cppd/svtr_base_cppd_syn.yml +124 -0
  35. configs/rec/cppd/svtrv2_cppd.yml +150 -0
  36. configs/rec/dan/resnet45_fpn_dan.yml +98 -0
  37. configs/rec/dan/svtrv2_dan.yml +130 -0
  38. configs/rec/dptr/dptr_parseq_pretrain.yml +88 -0
  39. configs/rec/focalsvtr/focalsvtr_ctc.yml +137 -0
  40. configs/rec/gtc/svtrv2_lnconv_nrtr_gtc.yml +168 -0
  41. configs/rec/gtc/svtrv2_lnconv_smtr_gtc_long_infer.yml +151 -0
  42. configs/rec/gtc/svtrv2_lnconv_smtr_gtc_smtr_long.yml +150 -0
  43. configs/rec/gtc/svtrv2_lnconv_smtr_gtc_stream.yml +152 -0
  44. configs/rec/igtr/readme.md +192 -0
  45. configs/rec/igtr/svtr_base_ds_igtr.yml +157 -0
  46. configs/rec/lister/focalsvtr_lister_wo_fem_maxratio12.yml +133 -0
  47. configs/rec/lister/svtrv2_lister_wo_fem_maxratio12.yml +138 -0
  48. configs/rec/lpv/svtr_base_lpv.yml +124 -0
  49. configs/rec/lpv/svtr_base_lpv_wo_glrm.yml +123 -0
  50. configs/rec/lpv/svtrv2_lpv.yml +147 -0
app.py ADDED
@@ -0,0 +1,361 @@
1
+ import os
2
+ import uuid
3
+ import shutil
4
+ import re
5
+ import base64
6
+ import gradio as gr
7
+ from PIL import Image
8
+
9
+ from tools.infer_doc import OpenDoc
10
+ from tools.utils.logging import get_logger
11
+
12
+ logger = get_logger(name='opendoc_gradio')
13
+
14
+ # Initialize the pipeline
15
+ pipeline: OpenDoc | None = None
16
+
17
+
18
+ def get_pipeline(gpu_id: int) -> OpenDoc:
19
+ """获取或初始化OpenDoc流水线
20
+
21
+ Args:
22
+ gpu_id: GPU device ID; -1 means use CPU
23
+
24
+ Returns:
25
+ OpenDoc: the initialized OpenDoc instance
26
+ """
27
+ global pipeline
28
+ if pipeline is None:
29
+ logger.info(
30
+ f"Initializing OpenDoc pipeline on {'GPU ' + str(gpu_id) if gpu_id >= 0 else 'CPU'}..."
31
+ )
32
+ pipeline = OpenDoc(gpuId=gpu_id)
33
+ return pipeline
34
+
35
+
36
+ # Ensure pipeline is initialized
37
+ try:
38
+ current_pipeline = get_pipeline(0)
39
+ except Exception as e:
40
+ raise e
41
+
42
+
43
+ def process_image(image_path: str | None) -> tuple[Image.Image | None, str, str, str | None, str, str]:
44
+ """处理图片并进行OCR识别
45
+
46
+ Args:
47
+ image_path: path to the image file; None means no image
48
+
49
+ Returns:
50
+ tuple: (visualization image, Markdown content (base64 images), JSON content, ZIP file path, raw Markdown, Markdown content (base64 images))
51
+ """
52
+ if image_path is None:
53
+ return None, '', '', None, '', ''
54
+
55
+ # Get original image name
56
+ base_name = os.path.splitext(os.path.basename(image_path))[0]
57
+ file_ext = os.path.splitext(image_path)[1] or '.jpg'
58
+
59
+ # Create a directory with image name for this request
60
+ output_base_dir = 'gradio_outputs'
61
+ os.makedirs(output_base_dir, exist_ok=True)
62
+
63
+ # Add a short unique suffix (from a UUID) to avoid conflicts if the same filename is uploaded multiple times
64
+ timestamp = str(uuid.uuid4())[:8]
65
+ folder_name = f"{base_name}_{timestamp}"
66
+ tmp_dir = os.path.join(output_base_dir, folder_name)
67
+ os.makedirs(tmp_dir, exist_ok=True)
68
+
69
+ try:
70
+ # Copy and rename the input image
71
+ tmp_img_path = os.path.join(tmp_dir, f'{base_name}{file_ext}')
72
+ image = Image.open(image_path)
73
+ image.save(tmp_img_path)
74
+
75
+ # Predict
76
+ output = list(
77
+ current_pipeline.predict(tmp_img_path,
78
+ use_doc_orientation_classify=False,
79
+ use_doc_unwarping=False))
80
+ if not output:
81
+ return None, 'No results found.', '', None, '', ''
82
+
83
+ res = output[0]
84
+
85
+ # Save results
86
+ res.save_to_img(tmp_dir)
87
+ res.save_to_markdown(tmp_dir, pretty=True)
88
+ res.save_to_json(tmp_dir)
89
+
90
+ # Find the saved files
91
+ vis_img = None
92
+ for f in os.listdir(tmp_dir):
93
+ if 'layout_order_res' in f:
94
+ vis_img_path = os.path.join(tmp_dir, f)
95
+ vis_img = Image.open(vis_img_path)
96
+ break
97
+
98
+ markdown_content = ''
99
+ md_file_path = None
100
+ for f in os.listdir(tmp_dir):
101
+ if f.endswith('.md'):
102
+ md_file_path = os.path.join(tmp_dir, f)
103
+ with open(md_file_path, 'r', encoding='utf-8') as file:
104
+ markdown_content = file.read()
105
+ break
106
+
107
+ # Convert relative image paths to base64 for proper display in Gradio
108
+ if markdown_content:
109
+
110
+ def replace_img_with_base64(match):
111
+ img_path = match.group(1)
112
+ full_img_path = os.path.join(tmp_dir, img_path)
113
+
114
+ if os.path.exists(full_img_path):
115
+ try:
116
+ with open(full_img_path, 'rb') as img_file:
117
+ img_data = base64.b64encode(img_file.read()).decode('utf-8')
118
+ # Determine image format
119
+ ext = os.path.splitext(full_img_path)[1].lower()
120
+ mime_type = 'image/jpeg' if ext in ['.jpg', '.jpeg'] else 'image/png'
121
+ # Replace src with base64 data URL
122
+ return match.group(0).replace(f'src="{img_path}"', f'src="data:{mime_type};base64,{img_data}"')
123
+ except Exception as e:
124
+ logger.warning(f'Failed to convert image {img_path} to base64: {e}')
125
+ return match.group(0)
126
+
127
+ # Find all img tags and replace their src
128
+ markdown_content_show = re.sub(r'<img[^>]*src="([^"]+)"[^>]*>', replace_img_with_base64, markdown_content)
129
+ else:
130
+ markdown_content_show = markdown_content
131
+
132
+ json_content = ''
133
+ json_file_path = None
134
+ for f in os.listdir(tmp_dir):
135
+ if f.endswith('.json'):
136
+ json_file_path = os.path.join(tmp_dir, f)
137
+ with open(json_file_path, 'r', encoding='utf-8') as file:
138
+ json_content = file.read()
139
+ break
140
+
141
+ # Prepare all files in tmp_dir for download by creating a zip archive
142
+ zip_path = os.path.join(output_base_dir, f'{folder_name}.zip')
143
+ _ = shutil.make_archive(zip_path.replace('.zip', ''), 'zip', tmp_dir)
144
+
145
+ return vis_img, markdown_content_show, json_content, zip_path, markdown_content, markdown_content_show
146
+
147
+ except Exception as e:
148
+ logger.error(f'Prediction error: {str(e)}')
149
+ return None, f'Error during prediction: {str(e)}', '', None, '', ''
150
+
151
+
152
+ # Custom CSS with adaptive colors
153
+ custom_css = """
154
+ body, .gradio-container {
155
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Noto Sans", Helvetica, Arial, sans-serif;
156
+ }
157
+ .app-header {
158
+ text-align: center;
159
+ max-width: 1200px;
160
+ margin: 20px auto !important;
161
+ padding: 20px;
162
+ }
163
+ .app-header h1 {
164
+ font-size: 2.5em;
165
+ font-weight: 700;
166
+ margin-bottom: 10px;
167
+ }
168
+ .app-header p {
169
+ font-size: 1.1em;
170
+ opacity: 0.7;
171
+ line-height: 1.6;
172
+ }
173
+ .quick-links {
174
+ text-align: center;
175
+ padding: 12px 0;
176
+ border: 1px solid var(--border-color-primary);
177
+ border-radius: 12px;
178
+ margin: 16px auto;
179
+ max-width: 1200px;
180
+ background: var(--background-fill-secondary);
181
+ }
182
+ .quick-links a {
183
+ margin: 0 16px;
184
+ font-size: 15px;
185
+ font-weight: 600;
186
+ color: var(--link-text-color);
187
+ text-decoration: none;
188
+ transition: all 0.3s ease;
189
+ }
190
+ .quick-links a:hover {
191
+ opacity: 0.8;
192
+ text-decoration: underline;
193
+ }
194
+ .upload-section {
195
+ border: 2px dashed var(--border-color-primary);
196
+ border-radius: 12px;
197
+ padding: 20px;
198
+ background: var(--background-fill-secondary);
199
+ transition: all 0.3s ease;
200
+ }
201
+ .upload-section:hover {
202
+ border-color: var(--color-accent);
203
+ background: var(--background-fill-primary);
204
+ }
205
+ #vis_output {
206
+ min-height: 400px;
207
+ border-radius: 12px;
208
+ overflow: hidden;
209
+ }
210
+ #md_preview {
211
+ max-height: 600px;
212
+ min-height: 200px;
213
+ overflow: auto;
214
+ padding: 20px;
215
+ background: var(--background-fill-primary);
216
+ border-radius: 12px;
217
+ box-shadow: var(--shadow-drop);
218
+ }
219
+ #md_preview img {
220
+ display: block;
221
+ margin: 16px auto;
222
+ max-width: 100%;
223
+ height: auto;
224
+ border-radius: 8px;
225
+ }
226
+ .notice {
227
+ margin: 20px auto;
228
+ max-width: 1200px;
229
+ padding: 16px 20px;
230
+ border-left: 4px solid var(--color-accent);
231
+ border-radius: 8px;
232
+ background: var(--background-fill-secondary);
233
+ font-size: 14px;
234
+ line-height: 1.8;
235
+ }
236
+ .notice strong {
237
+ font-weight: 700;
238
+ color: var(--color-accent);
239
+ }
240
+ .notice ul {
241
+ margin-top: 8px;
242
+ padding-left: 20px;
243
+ }
244
+ .notice li {
245
+ margin: 8px 0;
246
+ }
247
+ .gradio-button-primary {
248
+ font-weight: 600 !important;
249
+ transition: all 0.3s ease !important;
250
+ }
251
+ .gradio-button-primary:hover {
252
+ transform: translateY(-2px);
253
+ box-shadow: var(--shadow-drop-lg) !important;
254
+ }
255
+ """
256
+
257
+ # LaTeX delimiters for formula rendering
258
+ LATEX_DELIMS = [
259
+ {"left": "$$", "right": "$$", "display": True},
260
+ {"left": "$", "right": "$", "display": False},
261
+ {"left": "\\(", "right": "\\)", "display": False},
262
+ {"left": "\\[", "right": "\\]", "display": True},
263
+ ]
264
+
265
+
266
+ # Define the Gradio Interface
267
+ def create_demo() -> gr.Blocks:
268
+ """创建Gradio演示界面
269
+
270
+ Returns:
271
+ gr.Blocks: the Gradio Blocks application instance
272
+ """
273
+ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title='OpenDoc-0.1B Demo') as demo:
274
+ # Header
275
+ gr.HTML("""
276
+ <div class="app-header">
277
+ <h1>🚀 OpenDoc-0.1B</h1>
278
+ <p>Ultra-Lightweight Document Parsing System with 0.1B Parameters (built by <a href="https://github.com/Topdu/OpenOCR">OCR Team</a>, <a href="https://fvl.fudan.edu.cn">FVL Lab</a>)</p>
279
+ <p style="font-size: 0.95em; color: #888;">
280
+ Powered by <a href="https://www.paddleocr.ai/latest/version3.x/module_usage/layout_analysis.html" target="_blank">PP-DocLayoutV2</a> for layout analysis and <a href="https://arxiv.org/pdf/2512.21095" target="_blank">UniRec-0.1B</a> for unified recognition of text, formulas, and tables
281
+ </p>
282
+ </div>
283
+ """)
284
+
285
+ # Quick links
286
+ gr.HTML("""
287
+ <div class="quick-links">
288
+ <a href="https://github.com/Topdu/OpenOCR" target="_blank">📖 GitHub</a>
289
+ <a href="https://arxiv.org/pdf/2512.21095" target="_blank">📄 Paper</a>
290
+ <a href="https://huggingface.co/topdu/unirec-0.1b" target="_blank">🤗 Model</a>
291
+ </div>
292
+ """)
293
+
294
+ with gr.Row():
295
+ with gr.Column(scale=5, elem_classes=["upload-section"]):
296
+ input_img = gr.Image(type='filepath', label='📤 Upload Document Image', height=400)
297
+
298
+ gr.Markdown("""
299
+ ### 💡 Tips
300
+ - Supports Chinese and English documents
301
+ - Best for reports, papers, magazines, and complex layouts
302
+ - Handles text, formulas, tables, and images
303
+ """)
304
+
305
+ btn = gr.Button('🔍 Analyze Document', variant='primary', size='lg')
306
+ download_output = gr.File(label='📥 Download All Results (ZIP)', visible=True)
307
+
308
+ with gr.Column(scale=7):
309
+ with gr.Tabs():
310
+ with gr.Tab('📝 Markdown Preview'):
311
+ output_md = gr.Markdown(
312
+ 'Please upload an image and click "Analyze Document" to see results.',
313
+ latex_delimiters=LATEX_DELIMS,
314
+ elem_id='md_preview'
315
+ )
316
+ with gr.Tab('📊 Layout Visualization'):
317
+ output_vis = gr.Image(type='pil', label='Layout Analysis Results', elem_id='vis_output')
318
+
319
+ with gr.Tab('📄 Raw Markdown'):
320
+ output_md_raw = gr.Code(
321
+ label='Markdown Source',
322
+ language='markdown',
323
+ lines=20
324
+ )
325
+ with gr.Tab('📄 Raw Markdown with Base64 Images'):
326
+ output_md_raw_with_base64 = gr.Code(
327
+ label='Markdown Source',
328
+ language='markdown',
329
+ lines=20
330
+ )
331
+
332
+ with gr.Tab('🗂️ JSON Result'):
333
+ output_json = gr.Code(label='Structured Data', language='json')
334
+
335
+ # Feature notice
336
+ gr.HTML("""
337
+ <div class="notice">
338
+ <strong>✨ Key Features:</strong>
339
+ <ul>
340
+ <li><strong>Ultra-lightweight:</strong> Only 0.1B parameters, fast inference speed</li>
341
+ <li><strong>High accuracy:</strong> Achieves 90.57% on OmniDocBench (v1.5)</li>
342
+ <li><strong>Unified recognition:</strong> Handles text, formulas, and tables in one model</li>
343
+ <li><strong>Rich output:</strong> Provides Markdown, JSON, and visualization results</li>
344
+ </ul>
345
+ </div>
346
+ """)
347
+
348
+ btn.click(
349
+ fn=process_image,
350
+ inputs=[input_img],
351
+ outputs=[output_vis, output_md, output_json, download_output, output_md_raw, output_md_raw_with_base64]
352
+ )
353
+
354
+ return demo
355
+
356
+
357
+ if __name__ == '__main__':
358
+ demo = create_demo()
359
+ demo.queue(max_size=20).launch(
360
+ share=False
361
+ )
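
Note: below is a minimal standalone sketch of the same pipeline calls that app.py wraps, useful for scripted (non-Gradio) use. It assumes tools.infer_doc.OpenDoc behaves exactly as it is invoked above; the sample image path and output directory are illustrative.

import os
from tools.infer_doc import OpenDoc

# Same constructor and predict() call used in get_pipeline()/process_image() above.
pipeline = OpenDoc(gpuId=0)  # pass -1 to run on CPU
results = list(pipeline.predict('sample_doc.jpg',  # illustrative path
                                use_doc_orientation_classify=False,
                                use_doc_unwarping=False))
if results:
    res = results[0]
    os.makedirs('outputs', exist_ok=True)
    res.save_to_img('outputs')
    res.save_to_markdown('outputs', pretty=True)
    res.save_to_json('outputs')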
configs/dataset/rec/evaluation.yaml ADDED
@@ -0,0 +1,41 @@
1
+ root: ../evaluation
2
+ task: str
3
+ download_links:
4
+ # IC15_1811
5
+ - https://drive.usercontent.google.com/download?id=1eGY0kXNV1qVxeUpoGzs-ioUO-ky7msH6&authuser=0&confirm=t
6
+ - https://drive.usercontent.google.com/download?id=1BWv7aLoLAT7avY326gXP3GJF48UZpuBC&authuser=0&confirm=t
7
+ # SVT
8
+ - https://drive.usercontent.google.com/download?id=1ecEZ4cJ7dIbTCZRltE0s5KzUotQWagH-&authuser=0&confirm=t
9
+ - https://drive.usercontent.google.com/download?id=1OygBP7i9R-3Pwi6WodCcW31J8CUMugOJ&authuser=0&confirm=t
10
+ # IIIT5k
11
+ - https://drive.usercontent.google.com/download?id=1PJ9_IvIGZTS5hHdGLnpKuYKZcCO8jE0E&authuser=0&confirm=t
12
+ - https://drive.usercontent.google.com/download?id=10P3MixSBt1v8k8_6aFfziC33Z5IlM6Uf&authuser=0&confirm=t
13
+ # IC13_857
14
+ - https://drive.usercontent.google.com/download?id=1-wMHOFBXJaOaY-UD00nDn6qw2s_8R4Vd&authuser=0&confirm=t
15
+ - https://drive.usercontent.google.com/download?id=1J1QCFtOFxFKiLJIgTqZ6eRo9Y5QGqHpA&authuser=0&confirm=t
16
+ # SVTP
17
+ - https://drive.usercontent.google.com/download?id=1kckwfZkdaHG8k_FW5IIJKUaYZkF21Hza&authuser=0&confirm=t
18
+ - https://drive.usercontent.google.com/download?id=1x61lm_ea7lvIdxNPMG-jy-5W0MxtdH0N&authuser=0&confirm=t
19
+ # CUTE80
20
+ - https://drive.usercontent.google.com/download?id=1Zv_91c81tinLy5Je89HPr-5wUSnqXKIB&authuser=0&confirm=t
21
+ - https://drive.usercontent.google.com/download?id=1OuJ6QoJ9AlyNHIM9j2WedAPxTnac7kyY&authuser=0&confirm=t
22
+ filenames:
23
+ # IC15_1811
24
+ - ../evaluation/IC15_1811/data.mdb
25
+ - ../evaluation/IC15_1811/lock.mdb
26
+ # SVT
27
+ - ../evaluation/SVT/data.mdb
28
+ - ../evaluation/SVT/lock.mdb
29
+ # IIIT5k
30
+ - ../evaluation/IIIT5k/data.mdb
31
+ - ../evaluation/IIIT5k/lock.mdb
32
+ # IC13_857
33
+ - ../evaluation/IC13_857/data.mdb
34
+ - ../evaluation/IC13_857/lock.mdb
35
+ # SVTP
36
+ - ../evaluation/SVTP/data.mdb
37
+ - ../evaluation/SVTP/lock.mdb
38
+ # CUTE80
39
+ - ../evaluation/CUTE80/data.mdb
40
+ - ../evaluation/CUTE80/lock.mdb
41
+ check_validity: true
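
Note: the dataset YAMLs added in this commit share one schema: download_links and filenames are parallel lists, root is the target directory, and check_validity gates an integrity check. A rough sketch of how such a config could be consumed (a hypothetical helper, not the repository's actual download tool):

import os
import urllib.request
import yaml

with open('configs/dataset/rec/evaluation.yaml', 'r', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

# download_links[i] is expected to correspond to filenames[i]
for url, path in zip(cfg['download_links'], cfg['filenames']):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if not os.path.exists(path):
        # Google Drive links may require a confirmed session/cookie in practice
        urllib.request.urlretrieve(url, path)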
configs/dataset/rec/ltb.yaml ADDED
@@ -0,0 +1,9 @@
1
+ root: ../ltb
2
+ task: str
3
+ download_links:
4
+ - https://drive.usercontent.google.com/download?id=16AEA1YGTsyVB44uEjKi4ZUV1snjCYBr4&authuser=0&confirm=t
5
+ - https://drive.usercontent.google.com/download?id=1xU4OStrOaI23bPG4flWAPWn2YrQe2bmY&authuser=0&confirm=t
6
+ filenames:
7
+ - ../ltb/data.mdb
8
+ - ../ltb/lock.mdb
9
+ check_validity: true
configs/dataset/rec/mjsynth.yaml ADDED
@@ -0,0 +1,11 @@
1
+ root: ../synth
2
+ task: str
3
+ download_links:
4
+ - https://drive.usercontent.google.com/download?id=1FIoplSFZ-BKQoRDHDXsVMKa844e-K8PD&authuser=0&confirm=t
5
+ - https://drive.usercontent.google.com/download?id=1eckTvaeRtlTZvbO2orrVz-cIuIk6i87K&authuser=0&confirm=t
6
+ - https://drive.usercontent.google.com/download?id=1PBXTf-2PnmEvJBsqzJqxxRwzhAZGTiMG&authuser=0&confirm=t
7
+ filenames:
8
+ - ../synth/MJ_train.zip
9
+ - ../synth/MJ_val.zip
10
+ - ../synth/MJ_test.zip
11
+ check_validity: true
configs/dataset/rec/openvino.yaml ADDED
@@ -0,0 +1,25 @@
1
+ root: ../OpenVINO
2
+ task: str
3
+ download_links:
4
+ # train_1
5
+ - https://drive.usercontent.google.com/download?id=1q23QAIRTyG0t-bBm4aAwRwiqB6VUfphw&authuser=0&confirm=
6
+ # train_2
7
+ - https://drive.usercontent.google.com/download?id=1AtbaJljM68cbZqi5lcM92d9VkQUCbSqI&authuser=0&confirm=
8
+ # train_5
9
+ - https://drive.usercontent.google.com/download?id=1dejstYnJ8_sESuO_uvwi__jT1B8gPxf3&authuser=0&confirm=t
10
+ # train_f
11
+ - https://drive.usercontent.google.com/download?id=1C4akchTc7-yi1OS_sJ3KP693UKcnecke&authuser=0&confirm=t
12
+ # validation
13
+ - https://drive.usercontent.google.com/download?id=17TRzSQhuK_juAxAv3KmX0y13pQP2cz6R&authuser=0&confirm=t
14
+ filenames:
15
+ # train_1
16
+ - ../OpenVINO/train_1.zip
17
+ # train_2
18
+ - ../OpenVINO/train_2.zip
19
+ # train_5
20
+ - ../OpenVINO/train_5.zip
21
+ # train_f
22
+ - ../OpenVINO/train_f.zip
23
+ # validation
24
+ - ../OpenVINO/validation.zip
25
+ check_validity: true
configs/dataset/rec/ost.yaml ADDED
@@ -0,0 +1,17 @@
1
+ root: ../OST
2
+ task: str
3
+ download_links:
4
+ # OST heavy
5
+ - https://drive.usercontent.google.com/download?id=1RGpIFbD_SRlrzZFBoVF_LGvetNx1-5pg&authuser=0&confirm=t
6
+ - https://drive.usercontent.google.com/download?id=1Th4MfDf44k0EBpIqCLqVoGRu6G-FP1hq&authuser=0&confirm=t
7
+ # OST weak
8
+ - https://drive.usercontent.google.com/download?id=1z5CTDJucUnvALG12Q4UXk1DDKJDd8WJn&authuser=0&confirm=t
9
+ - https://drive.usercontent.google.com/download?id=1V17TTkX3sjpV7v0km_F2SDCK0tL3k_ls&authuser=0&confirm=t
10
+ filenames:
11
+ # OST heavy
12
+ - ../OST/heavy/data.mdb
13
+ - ../OST/heavy/lock.mdb
14
+ # OST weak
15
+ - ../OST/weak/data.mdb
16
+ - ../OST/weak/lock.mdb
17
+ check_validity: true
configs/dataset/rec/synthtext.yaml ADDED
@@ -0,0 +1,7 @@
1
+ root: ../synth
2
+ task: str
3
+ download_links:
4
+ - https://drive.usercontent.google.com/download?id=1T-enqkq6_l2HqrsV3da_h0oJ7CUKu_oc&authuser=0&confirm=t
5
+ filenames:
6
+ - ../synth/ST.zip
7
+ check_validity: true
configs/dataset/rec/test.yaml ADDED
@@ -0,0 +1,77 @@
1
+ root: ../test
2
+ task: str
3
+ download_links:
4
+ # IC13_857
5
+ - https://drive.usercontent.google.com/download?id=1PZSCbe6_DI8MlCqCRWXGT2PP92_frIXq&authuser=0&confirm=t
6
+ - https://drive.usercontent.google.com/download?id=1qkN7NDg0zUHxUiZHAeEatDTqlsgpFWp3&authuser=0&confirm=t
7
+ # IC15_2077
8
+ - https://drive.usercontent.google.com/download?id=1dFkY3DNbr-Mepn3TWBiA9COEJ63fGFcp&authuser=0&confirm=t
9
+ - https://drive.usercontent.google.com/download?id=1UvVwLNZ3tS1YdTBa8MulPzjeVezKaDro&authuser=0&confirm=t
10
+ # SVTP
11
+ - https://drive.usercontent.google.com/download?id=1aofeerilxJ7J3S7QxuCEXbmXTpz8Xshx&authuser=0&confirm=t
12
+ - https://drive.usercontent.google.com/download?id=1rJ1KoO4K_VUxEAUN_bMgBGzK8_JZAAno&authuser=0&confirm=t
13
+ # IIIT5k
14
+ - https://drive.usercontent.google.com/download?id=1XFO2M1Kbgwv3-iTNTmhQXAEjNmKYOeoT&authuser=0&confirm=t
15
+ - https://drive.usercontent.google.com/download?id=1stwK2hFsyaV7HHsEG9EYgnUQebNb2_nG&authuser=0&confirm=t
16
+ # COCOv1.4
17
+ - https://drive.usercontent.google.com/download?id=1Se2QSGS19xx7Gfy-SUdX9mlAOr2eYsfA&authuser=0&confirm=t
18
+ - https://drive.usercontent.google.com/download?id=1xvekFi389QfkH7yS0JIVV0QzjhUspjDv&authuser=0&confirm=t
19
+ # IC15_1811
20
+ - https://drive.usercontent.google.com/download?id=1pHsw8wrThD9EGEE6AusQLZozefSj4iyR&authuser=0&confirm=t
21
+ - https://drive.usercontent.google.com/download?id=1TXZ1qHuKAksaAlvd3qMv4IHKnN-IJW9a&authuser=0&confirm=t
22
+ # Uber
23
+ - https://drive.usercontent.google.com/download?id=1L2j6BZeLTGQ1FIl8HB_D3AFiWLltGV5r&authuser=0&confirm=t
24
+ - https://drive.usercontent.google.com/download?id=12DUj28yzLWxFO_gfMfSjTkRujYD5MNEE&authuser=0&confirm=t
25
+ # IC13_1095
26
+ - https://drive.usercontent.google.com/download?id=1fu8onMt3Z6fDLNAiHcm-sQ2qCXduE-FU&authuser=0&confirm=t
27
+ - https://drive.usercontent.google.com/download?id=1OQAZtLj8U2Cl4L0ErGFsz6vGIVTTWasD&authuser=0&confirm=t
28
+ # IC13_1015
29
+ - https://drive.usercontent.google.com/download?id=1mbsfuvWB282HYfn9tbqcj1nUDkLXcSNB&authuser=0&confirm=t
30
+ - https://drive.usercontent.google.com/download?id=1QGogU_hV-oN7iY2POutdD2LDcmK6plnV&authuser=0&confirm=t
31
+ # ArT
32
+ - https://drive.usercontent.google.com/download?id=1-53knSy-uTSngCG7wyBngVyTuTCmdnWl&authuser=0&confirm=t
33
+ - https://drive.usercontent.google.com/download?id=172EsSaf7BVaB1ORtohi-Jc_8SuUKZGGf&authuser=0&confirm=t
34
+ # SVT
35
+ - https://drive.usercontent.google.com/download?id=1p7aVUr9Yr7c4X4YUBvk2-YP28rraHjn9&authuser=0&confirm=t
36
+ - https://drive.usercontent.google.com/download?id=1ALmhvSleZ0yf-lcdbQPP3M9Zc3oqnXij&authuser=0&confirm=t
37
+ # CUTE80
38
+ - https://drive.usercontent.google.com/download?id=1Ujr4axHKnu54P2rIGUhkjdM6XlhDYrI_&authuser=0&confirm=t
39
+ - https://drive.usercontent.google.com/download?id=1DvZi9L3MqjO2zRUyCg3YvP4qMAt2bsme&authuser=0&confirm=t
40
+ filenames:
41
+ # IC13_857
42
+ - ../test/IC13_857/data.mdb
43
+ - ../test/IC13_857/lock.mdb
44
+ # IC15_2077
45
+ - ../test/IC15_2077/data.mdb
46
+ - ../test/IC15_2077/lock.mdb
47
+ # SVTP
48
+ - ../test/SVTP/data.mdb
49
+ - ../test/SVTP/lock.mdb
50
+ # IIIT5k
51
+ - ../test/IIIT5k/data.mdb
52
+ - ../test/IIIT5k/lock.mdb
53
+ # COCOv1.4
54
+ - ../test/COCOv1.4/data.mdb
55
+ - ../test/COCOv1.4/lock.mdb
56
+ # IC15_1811
57
+ - ../test/IC15_1811/data.mdb
58
+ - ../test/IC15_1811/lock.mdb
59
+ # Uber
60
+ - ../test/Uber/data.mdb
61
+ - ../test/Uber/lock.mdb
62
+ # IC13_1095
63
+ - ../test/IC13_1095/data.mdb
64
+ - ../test/IC13_1095/lock.mdb
65
+ # IC13_1015
66
+ - ../test/IC13_1015/data.mdb
67
+ - ../test/IC13_1015/lock.mdb
68
+ # ArT
69
+ - ../test/ArT/data.mdb
70
+ - ../test/ArT/lock.mdb
71
+ # SVT
72
+ - ../test/SVT/data.mdb
73
+ - ../test/SVT/lock.mdb
74
+ # CUTE80
75
+ - ../test/CUTE80/data.mdb
76
+ - ../test/CUTE80/lock.mdb
77
+ check_validity: true
configs/dataset/rec/textocr.yaml ADDED
@@ -0,0 +1,13 @@
1
+ root: ../TextOCR
2
+ task: str
3
+ download_links:
4
+ # train
5
+ - https://drive.usercontent.google.com/download?id=1jVjJFno4pnsU0Cp_kn4MIXQrChmELy92&authuser=0&confirm=
6
+ # val
7
+ - https://drive.usercontent.google.com/download?id=1ubIRu01MXIek6OvInu-XjaIbw6277-vw&authuser=0&confirm=t
8
+ filenames:
9
+ # train
10
+ - ../TextOCR/train.zip
11
+ # val
12
+ - ../TextOCR/val.zip
13
+ check_validity: true
configs/dataset/rec/textocr_horizontal.yaml ADDED
@@ -0,0 +1,13 @@
1
+ root: ../TextOCR_horizontal
2
+ task: str
3
+ download_links:
4
+ # train
5
+ - https://drive.usercontent.google.com/download?id=1sWH6J11xbjQb8SH7fdG_8mIKVI81ZQy5&authuser=0&confirm=
6
+ # val
7
+ - https://drive.usercontent.google.com/download?id=1gIE-AU2o-5hvg288-bjphO6UkI5AEQ2d&authuser=0&confirm=t
8
+ filenames:
9
+ # train
10
+ - ../TextOCR_horizontal/train.zip
11
+ # val
12
+ - ../TextOCR_horizontal/val.zip
13
+ check_validity: true
configs/dataset/rec/union14m_b.yaml ADDED
@@ -0,0 +1,47 @@
1
+ root: ../u14m
2
+ task: str
3
+ download_links:
4
+ # artistic
5
+ - https://drive.usercontent.google.com/download?id=1Je2DTuFHnkXDI99yDnm9Anl5naWaCQwd&authuser=0&confirm=t
6
+ - https://drive.usercontent.google.com/download?id=1xtT_Q0juBJUIvAG55qBxoVNNTECd2usZ&authuser=0&confirm=t
7
+ # contextless
8
+ - https://drive.usercontent.google.com/download?id=1_0OzyzWhZOmGrHkayFTVrzhrQrNRDRPR&authuser=0&confirm=t
9
+ - https://drive.usercontent.google.com/download?id=1PPgC42y3xoM9bR0HQFbDYbcT3PzMdD_y&authuser=0&confirm=t
10
+ # salient
11
+ - https://drive.usercontent.google.com/download?id=1tHLMYBmTqRnxvFOTT3dfLfQiundqFWfd&authuser=0&confirm=t
12
+ - https://drive.usercontent.google.com/download?id=13NQgpAtCK0kh9M5E2pAUmKKEp6Qu5Xwj&authuser=0&confirm=t
13
+ # multi_words
14
+ - https://drive.usercontent.google.com/download?id=1IlnDKX3V_Vp9gsDGFB0xoqsVLH1vtxUI&authuser=0&confirm=t
15
+ - https://drive.usercontent.google.com/download?id=1mFFjC7C0CwevvkwFU9YeVbZBdps_3Qpb&authuser=0&confirm=t
16
+ # curve
17
+ - https://drive.usercontent.google.com/download?id=1MxhMd85cmhUtI2lmtXhZQuFk7lav0_fw&authuser=0&confirm=t
18
+ - https://drive.usercontent.google.com/download?id=1N03g-4e-kJG2mRvlM0c5TrwWAkd-iG-Q&authuser=0&confirm=t
19
+ # general
20
+ - https://drive.usercontent.google.com/download?id=1Oqt7OaycP466NWoDmoJ3FqS8YP3YRgvu&authuser=0&confirm=t
21
+ - https://drive.usercontent.google.com/download?id=1K0MrX5eYNt8IIGFHXCwg0_oI5OF5PPFO&authuser=0&confirm=t
22
+ # multi_oriented
23
+ - https://drive.usercontent.google.com/download?id=1TKZFcZPVk0ThqfF-AGhJk_OCLg0ykKbv&authuser=0&confirm=t
24
+ - https://drive.usercontent.google.com/download?id=1PAoLMUWuR7O2-7XRoKkNzQcSiznErQzD&authuser=0&confirm=t
25
+ filenames:
26
+ # artistic
27
+ - ../u14m/artistic/data.mdb
28
+ - ../u14m/artistic/lock.mdb
29
+ # contextless
30
+ - ../u14m/contextless/data.mdb
31
+ - ../u14m/contextless/lock.mdb
32
+ # salient
33
+ - ../u14m/salient/data.mdb
34
+ - ../u14m/salient/lock.mdb
35
+ # multi_words
36
+ - ../u14m/multi_words/data.mdb
37
+ - ../u14m/multi_words/lock.mdb
38
+ # curve
39
+ - ../u14m/curve/data.mdb
40
+ - ../u14m/curve/lock.mdb
41
+ # general
42
+ - ../u14m/general/data.mdb
43
+ - ../u14m/general/lock.mdb
44
+ # multi_oriented
45
+ - ../u14m/multi_oriented/data.mdb
46
+ - ../u14m/multi_oriented/lock.mdb
47
+ check_validity: true
configs/dataset/rec/union14m_l_filtered.yaml ADDED
@@ -0,0 +1,35 @@
1
+ root: ../Union14M-L-LMDB-Filtered
2
+ task: str
3
+ download_links:
4
+ # train_challenging
5
+ - https://drive.usercontent.google.com/download?id=1etwzBgGHjsFsb0sygsaRnKbanW2PMe07&authuser=0&confirm=t
6
+ - https://drive.usercontent.google.com/download?id=1ly6FJfPjItwGlVQ-ifTrzzM3rVu3Ezhr&authuser=0&confirm=t
7
+ # train_easy
8
+ - https://drive.usercontent.google.com/download?id=1_zeNluTnywIaa5h3PN-Ah9tKyByypot7&authuser=0&confirm=t
9
+ - https://drive.usercontent.google.com/download?id=1caYLeQHDidXgVBDi9IWXbO1gg__DYq9a&authuser=0&confirm=t
10
+ # train_hard
11
+ - https://drive.usercontent.google.com/download?id=1eP6s2xyYPZX9gykvWA4VSOc3Fqul_UB_&authuser=0&confirm=t
12
+ - https://drive.usercontent.google.com/download?id=1-ZlCvocX8P5uVRclUXp_5DNGLDzd16EO&authuser=0&confirm=t
13
+ # train_medium
14
+ - https://drive.usercontent.google.com/download?id=1s_CoaLNJEr-UxHYiqZ5jOcliMCFiRUUy&authuser=0&confirm=t
15
+ - https://drive.usercontent.google.com/download?id=1Wpj6WVpZ5Ily77kVwfQ18CiZBzkgmEnF&authuser=0&confirm=t
16
+ # train_normal
17
+ - https://drive.usercontent.google.com/download?id=1jPt44arlAswl9cXZjzmVcdpptdTPpJ3I&authuser=0&confirm=t
18
+ - https://drive.usercontent.google.com/download?id=1Rfc5kE03AzOUv7B_eYcBhUV8KMQ2MZ1m&authuser=0&confirm=t
19
+ filenames:
20
+ # train_challenging
21
+ - ../Union14M-L-LMDB-Filtered/train_challenging/data.mdb
22
+ - ../Union14M-L-LMDB-Filtered/train_challenging/lock.mdb
23
+ # train_easy
24
+ - ../Union14M-L-LMDB-Filtered/train_easy/data.mdb
25
+ - ../Union14M-L-LMDB-Filtered/train_easy/lock.mdb
26
+ # train_hard
27
+ - ../Union14M-L-LMDB-Filtered/train_hard/data.mdb
28
+ - ../Union14M-L-LMDB-Filtered/train_hard/lock.mdb
29
+ # train_medium
30
+ - ../Union14M-L-LMDB-Filtered/train_medium/data.mdb
31
+ - ../Union14M-L-LMDB-Filtered/train_medium/lock.mdb
32
+ # train_normal
33
+ - ../Union14M-L-LMDB-Filtered/train_normal/data.mdb
34
+ - ../Union14M-L-LMDB-Filtered/train_normal/lock.mdb
35
+ check_validity: true
configs/det/dbnet/repvit_db.yml ADDED
@@ -0,0 +1,171 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: &epoch_num 500
4
+ log_smooth_window: 20
5
+ print_batch_step: 100
6
+ output_dir: ./output/det_repsvtr_db
7
+ save_epoch_step: [400, 25]
8
+ eval_batch_step:
9
+ - 0
10
+ - 1000
11
+ cal_metric_during_train: false
12
+ checkpoints:
13
+ pretrained_model: openocr_det_repvit_ch.pth
14
+ save_inference_dir: null
15
+ use_tensorboard: false
16
+ infer_img:
17
+ save_res_path: ./checkpoints/det_db/predicts_db.txt
18
+ distributed: true
19
+ model_type: det
20
+
21
+ Architecture:
22
+ algorithm: DB_mobile
23
+ Backbone:
24
+ name: RepSVTR_det
25
+ Neck:
26
+ name: RSEFPN
27
+ out_channels: 96
28
+ shortcut: True
29
+ Head:
30
+ name: DBHead
31
+ k: 50
32
+
33
+ Loss:
34
+ name: DBLoss
35
+ balance_loss: true
36
+ main_loss_type: DiceLoss
37
+ alpha: 5
38
+ beta: 10
39
+ ohem_ratio: 3
40
+
41
+ Optimizer:
42
+ name: Adam
43
+ lr: 0.001
44
+ weight_decay: 5.0e-05
45
+ filter_bias_and_bn: False
46
+
47
+ LRScheduler:
48
+ name: CosineAnnealingLR
49
+ warmup_epoch: 2
50
+
51
+ PostProcess:
52
+ name: DBPostProcess
53
+ thresh: 0.3
54
+ box_thresh: 0.6
55
+ max_candidates: 1000
56
+ unclip_ratio: 1.5
57
+ score_mode: 'slow'
58
+
59
+ Metric:
60
+ name: DetMetric
61
+ main_indicator: hmean
62
+
63
+ Train:
64
+ dataset:
65
+ name: SimpleDataSet
66
+ data_dir: ../icdar2015/text_localization/
67
+ label_file_list:
68
+ - ../icdar2015/text_localization/train_icdar2015_label.txt
69
+ ratio_list: [1.0]
70
+ transforms:
71
+ - DecodeImage:
72
+ img_mode: BGR
73
+ channel_first: false
74
+ - DetLabelEncode: null
75
+ - CopyPaste: null
76
+ - IaaAugment:
77
+ augmenter_args:
78
+ - type: Fliplr
79
+ args:
80
+ p: 0.5
81
+ - type: Affine
82
+ args:
83
+ rotate:
84
+ - -10
85
+ - 10
86
+ - type: Resize
87
+ args:
88
+ size:
89
+ - 0.5
90
+ - 3
91
+ - EastRandomCropData:
92
+ size:
93
+ - 640
94
+ - 640
95
+ max_tries: 50
96
+ keep_ratio: true
97
+ - MakeBorderMap:
98
+ shrink_ratio: 0.4
99
+ thresh_min: 0.3
100
+ thresh_max: 0.7
101
+ total_epoch: *epoch_num
102
+ - MakeShrinkMap:
103
+ shrink_ratio: 0.4
104
+ min_text_size: 8
105
+ total_epoch: *epoch_num
106
+ - NormalizeImage:
107
+ scale: 1./255.
108
+ mean:
109
+ - 0.485
110
+ - 0.456
111
+ - 0.406
112
+ std:
113
+ - 0.229
114
+ - 0.224
115
+ - 0.225
116
+ order: hwc
117
+ - ToCHWImage: null
118
+ - KeepKeys:
119
+ keep_keys:
120
+ - image
121
+ - threshold_map
122
+ - threshold_mask
123
+ - shrink_map
124
+ - shrink_mask
125
+ loader:
126
+ shuffle: true
127
+ drop_last: false
128
+ batch_size_per_card: 8
129
+ num_workers: 8
130
+
131
+ Eval:
132
+ dataset:
133
+ name: SimpleDataSet
134
+ data_dir: ../icdar2015/text_localization/
135
+ label_file_list:
136
+ - ../icdar2015/text_localization/test_icdar2015_label.txt
137
+ transforms:
138
+ - DecodeImage:
139
+ img_mode: BGR
140
+ channel_first: false
141
+ - DetLabelEncode: null
142
+ - DetResizeForTest:
143
+ # image_shape: [1280, 1280]
144
+ # keep_ratio: True
145
+ # padding: True
146
+ limit_side_len: 960
147
+ limit_type: max
148
+ - NormalizeImage:
149
+ scale: 1./255.
150
+ mean:
151
+ - 0.485
152
+ - 0.456
153
+ - 0.406
154
+ std:
155
+ - 0.229
156
+ - 0.224
157
+ - 0.225
158
+ order: hwc
159
+ - ToCHWImage: null
160
+ - KeepKeys:
161
+ keep_keys:
162
+ - image
163
+ - shape
164
+ - polys
165
+ - ignore_tags
166
+ loader:
167
+ shuffle: false
168
+ drop_last: false
169
+ batch_size_per_card: 1
170
+ num_workers: 2
171
+ profiler_options: null
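
Note: this config defines the epoch count once with a YAML anchor (&epoch_num) so that the MakeBorderMap/MakeShrinkMap transforms can reference it via *epoch_num. A small sketch of how that resolves when the file is parsed with PyYAML (an assumed loader; the repository's own config reader may add merging or overrides):

import yaml

with open('configs/det/dbnet/repvit_db.yml', 'r', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

print(cfg['Global']['epoch_num'])  # 500, defined once via &epoch_num
# Every "total_epoch: *epoch_num" alias in the Train transforms resolves to the same 500.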
configs/rec/abinet/resnet45_trans_abinet_lang.yml ADDED
@@ -0,0 +1,94 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet45_trans_abinet_lang/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ # ./openocr_nolang_abinet_lang.pth
12
+ checkpoints:
13
+ use_tensorboard: false
14
+ infer_img:
15
+ # for data or label process
16
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
17
+ max_text_length: 25
18
+ use_space_char: False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_abinet_lang.txt
20
+ grad_clip_val: 20
21
+ use_amp: True
22
+
23
+ Optimizer:
24
+ name: Adam
25
+ lr: 0.000267
26
+ weight_decay: 0.0
27
+ filter_bias_and_bn: False
28
+
29
+ LRScheduler:
30
+ name: MultiStepLR
31
+ milestones: [12]
32
+ gamma: 0.1
33
+
34
+ Architecture:
35
+ model_type: rec
36
+ algorithm: ABINet
37
+ Transform:
38
+ Encoder:
39
+ name: ResNet45
40
+ in_channels: 3
41
+ strides: [2, 1, 2, 1, 1]
42
+ Decoder:
43
+ name: ABINetDecoder
44
+ iter_size: 3
45
+
46
+ Loss:
47
+ name: ABINetLoss
48
+
49
+ PostProcess:
50
+ name: ABINetLabelDecode
51
+
52
+ Metric:
53
+ name: RecMetric
54
+ main_indicator: acc
55
+ is_filter: True
56
+
57
+ Train:
58
+ dataset:
59
+ name: LMDBDataSet
60
+ data_dir: ../Union14M-L-LMDB-Filtered
61
+ transforms:
62
+ - DecodeImagePIL: # load image
63
+ img_mode: RGB
64
+ - PARSeqAugPIL:
65
+ - ABINetLabelEncode:
66
+ - RecTVResize:
67
+ image_shape: [32, 128]
68
+ padding: False
69
+ - KeepKeys:
70
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
71
+ loader:
72
+ shuffle: True
73
+ batch_size_per_card: 256
74
+ drop_last: True
75
+ num_workers: 4
76
+
77
+ Eval:
78
+ dataset:
79
+ name: LMDBDataSet
80
+ data_dir: ../evaluation
81
+ transforms:
82
+ - DecodeImagePIL: # load image
83
+ img_mode: RGB
84
+ - ABINetLabelEncode:
85
+ - RecTVResize:
86
+ image_shape: [32, 128]
87
+ padding: False
88
+ - KeepKeys:
89
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
90
+ loader:
91
+ shuffle: False
92
+ drop_last: False
93
+ batch_size_per_card: 256
94
+ num_workers: 2
configs/rec/abinet/resnet45_trans_abinet_wo_lang.yml ADDED
@@ -0,0 +1,93 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet45_trans_abinet_wo_lang/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_abinet_wo_lang.txt
19
+ grad_clip_val: 20
20
+ use_amp: True
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.000267
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: MultiStepLR
30
+ milestones: [12]
31
+ gamma: 0.1
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: ABINet
36
+ Transform:
37
+ Encoder:
38
+ name: ResNet45
39
+ in_channels: 3
40
+ strides: [2, 1, 2, 1, 1]
41
+ Decoder:
42
+ name: ABINetDecoder
43
+ iter_size: 0
44
+
45
+ Loss:
46
+ name: ABINetLoss
47
+
48
+ PostProcess:
49
+ name: ABINetLabelDecode
50
+
51
+ Metric:
52
+ name: RecMetric
53
+ main_indicator: acc
54
+ is_filter: True
55
+
56
+ Train:
57
+ dataset:
58
+ name: LMDBDataSet
59
+ data_dir: ../Union14M-L-LMDB-Filtered
60
+ transforms:
61
+ - DecodeImagePIL: # load image
62
+ img_mode: RGB
63
+ - PARSeqAugPIL:
64
+ - ABINetLabelEncode:
65
+ - RecTVResize:
66
+ image_shape: [32, 128]
67
+ padding: False
68
+ - KeepKeys:
69
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
70
+ loader:
71
+ shuffle: True
72
+ batch_size_per_card: 256
73
+ drop_last: True
74
+ num_workers: 4
75
+
76
+ Eval:
77
+ dataset:
78
+ name: LMDBDataSet
79
+ data_dir: ../evaluation
80
+ transforms:
81
+ - DecodeImagePIL: # load image
82
+ img_mode: RGB
83
+ - ABINetLabelEncode:
84
+ - RecTVResize:
85
+ image_shape: [32, 128]
86
+ padding: False
87
+ - KeepKeys:
88
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
89
+ loader:
90
+ shuffle: False
91
+ drop_last: False
92
+ batch_size_per_card: 256
93
+ num_workers: 2
configs/rec/abinet/svtrv2_abinet_lang.yml ADDED
@@ -0,0 +1,130 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_abinet_lang/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ # ./openocr_svtrv2_nolang_abinet_lang.pth
12
+ checkpoints:
13
+ use_tensorboard: false
14
+ infer_img:
15
+ # for data or label process
16
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
17
+ max_text_length: 25
18
+ use_space_char: False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_abinet_lang.txt
20
+ use_amp: True
21
+ grad_clip_val: 20
22
+
23
+ Optimizer:
24
+ name: AdamW
25
+ lr: 0.00065 # for 4gpus bs256/gpu
26
+ weight_decay: 0.05
27
+ filter_bias_and_bn: True
28
+
29
+ LRScheduler:
30
+ name: OneCycleLR
31
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
32
+ cycle_momentum: False
33
+
34
+ Architecture:
35
+ model_type: rec
36
+ algorithm: ABINet
37
+ Transform:
38
+ Encoder:
39
+ name: SVTRv2LNConvTwo33
40
+ use_pos_embed: False
41
+ dims: [128, 256, 384]
42
+ depths: [6, 6, 6]
43
+ num_heads: [4, 8, 12]
44
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
45
+ local_k: [[5, 5], [5, 5], [-1, -1]]
46
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
47
+ last_stage: false
48
+ feat2d: True
49
+ Decoder:
50
+ name: ABINetDecoder
51
+ iter_size: 3
52
+ num_layers: 0
53
+
54
+ Loss:
55
+ name: ABINetLoss
56
+
57
+ PostProcess:
58
+ name: ABINetLabelDecode
59
+
60
+ Metric:
61
+ name: RecMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: RatioDataSetTVResize
68
+ ds_width: True
69
+ padding: false
70
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
71
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
72
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
73
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
75
+ ]
76
+ transforms:
77
+ - DecodeImagePIL: # load image
78
+ img_mode: RGB
79
+ - PARSeqAugPIL:
80
+ - ABINetLabelEncode:
81
+ - KeepKeys:
82
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
83
+ sampler:
84
+ name: RatioSampler
85
+ scales: [[128, 32]] # w, h
86
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
87
+ first_bs: &bs 256
88
+ fix_bs: false
89
+ divided_factor: [4, 16] # w, h
90
+ is_training: True
91
+ loader:
92
+ shuffle: True
93
+ batch_size_per_card: *bs
94
+ drop_last: True
95
+ max_ratio: &max_ratio 4
96
+ num_workers: 4
97
+
98
+ Eval:
99
+ dataset:
100
+ name: RatioDataSetTVResize
101
+ ds_width: True
102
+ padding: False
103
+ data_dir_list: [
104
+ '../evaluation/CUTE80',
105
+ '../evaluation/IC13_857',
106
+ '../evaluation/IC15_1811',
107
+ '../evaluation/IIIT5k',
108
+ '../evaluation/SVT',
109
+ '../evaluation/SVTP',
110
+ ]
111
+ transforms:
112
+ - DecodeImagePIL: # load image
113
+ img_mode: RGB
114
+ - ABINetLabelEncode:
115
+ - KeepKeys:
116
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
117
+ sampler:
118
+ name: RatioSampler
119
+ scales: [[128, 32]] # w, h
120
+ # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
121
+ first_bs: *bs
122
+ fix_bs: false
123
+ divided_factor: [4, 16] # w, h
124
+ is_training: False
125
+ loader:
126
+ shuffle: False
127
+ drop_last: False
128
+ batch_size_per_card: *bs
129
+ max_ratio: *max_ratio
130
+ num_workers: 4
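
Note: the OneCycleLR comment above ("pct_start 0.075*20 = 1.5ep") maps warmup_epoch to the scheduler's pct_start fraction. A PyTorch sketch of that relationship (the model, optimizer, and step counts below are illustrative, not taken from the repository):

import torch

model = torch.nn.Linear(8, 8)  # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00065, weight_decay=0.05)

epochs, steps_per_epoch = 20, 1000  # illustrative step count
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.00065,
    total_steps=epochs * steps_per_epoch,
    pct_start=1.5 / epochs,   # warmup_epoch 1.5 of 20 epochs = 0.075
    cycle_momentum=False,     # matches cycle_momentum: False above
)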
configs/rec/abinet/svtrv2_abinet_wo_lang.yml ADDED
@@ -0,0 +1,128 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_abinet_wo_lang/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_abinet_wo_lang.txt
19
+ use_amp: True
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065 # for 4gpus bs256/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: ABINet
36
+ Transform:
37
+ Encoder:
38
+ name: SVTRv2LNConvTwo33
39
+ use_pos_embed: False
40
+ dims: [128, 256, 384]
41
+ depths: [6, 6, 6]
42
+ num_heads: [4, 8, 12]
43
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
44
+ local_k: [[5, 5], [5, 5], [-1, -1]]
45
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
46
+ last_stage: false
47
+ feat2d: True
48
+ Decoder:
49
+ name: ABINetDecoder
50
+ iter_size: 0
51
+ num_layers: 0
52
+ Loss:
53
+ name: ABINetLoss
54
+
55
+ PostProcess:
56
+ name: ABINetLabelDecode
57
+
58
+ Metric:
59
+ name: RecMetric
60
+ main_indicator: acc
61
+ is_filter: True
62
+
63
+ Train:
64
+ dataset:
65
+ name: RatioDataSetTVResize
66
+ ds_width: True
67
+ padding: false
68
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
69
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
70
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
71
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
72
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
73
+ ]
74
+ transforms:
75
+ - DecodeImagePIL: # load image
76
+ img_mode: RGB
77
+ - PARSeqAugPIL:
78
+ - ABINetLabelEncode:
79
+ - KeepKeys:
80
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
81
+ sampler:
82
+ name: RatioSampler
83
+ scales: [[128, 32]] # w, h
84
+ # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
85
+ first_bs: &bs 256
86
+ fix_bs: false
87
+ divided_factor: [4, 16] # w, h
88
+ is_training: True
89
+ loader:
90
+ shuffle: True
91
+ batch_size_per_card: *bs
92
+ drop_last: True
93
+ max_ratio: &max_ratio 4
94
+ num_workers: 4
95
+
96
+ Eval:
97
+ dataset:
98
+ name: RatioDataSetTVResize
99
+ ds_width: True
100
+ padding: False
101
+ data_dir_list: [
102
+ '../evaluation/CUTE80',
103
+ '../evaluation/IC13_857',
104
+ '../evaluation/IC15_1811',
105
+ '../evaluation/IIIT5k',
106
+ '../evaluation/SVT',
107
+ '../evaluation/SVTP',
108
+ ]
109
+ transforms:
110
+ - DecodeImagePIL: # load image
111
+ img_mode: RGB
112
+ - ABINetLabelEncode:
113
+ - KeepKeys:
114
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
115
+ sampler:
116
+ name: RatioSampler
117
+ scales: [[128, 32]] # w, h
118
+ # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
119
+ first_bs: *bs
120
+ fix_bs: false
121
+ divided_factor: [4, 16] # w, h
122
+ is_training: False
123
+ loader:
124
+ shuffle: False
125
+ drop_last: False
126
+ batch_size_per_card: *bs
127
+ max_ratio: *max_ratio
128
+ num_workers: 4
configs/rec/aster/resnet31_lstm_aster_tps_on.yml ADDED
@@ -0,0 +1,93 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet31_lstm_aster_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/predicts_aster_tps.txt
19
+ use_amp: True
20
+ grad_clip_val: 1.0
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.002 # for 1gpus bs1024/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: aster
36
+ Transform:
37
+ name: Aster_TPS
38
+ tps_inputsize: [32, 64]
39
+ tps_outputsize: [32, 128]
40
+ Encoder:
41
+ name: ResNet_ASTER
42
+ Decoder:
43
+ name: ASTERDecoder
44
+
45
+ Loss:
46
+ name: ARLoss
47
+
48
+ Metric:
49
+ name: RecMetric
50
+ main_indicator: acc
51
+ is_filter: True
52
+
53
+ PostProcess:
54
+ name: ARLabelDecode
55
+
56
+ Train:
57
+ dataset:
58
+ name: LMDBDataSet
59
+ data_dir: ../Union14M-L-LMDB-Filtered
60
+ transforms:
61
+ - DecodeImagePIL: # load image
62
+ img_mode: RGB
63
+ - PARSeqAugPIL:
64
+ - ARLabelEncode: # Class handling label
65
+ - RecTVResize:
66
+ image_shape: [64, 256]
67
+ padding: False
68
+ - KeepKeys:
69
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
70
+ loader:
71
+ shuffle: True
72
+ batch_size_per_card: 1024
73
+ drop_last: True
74
+ num_workers: 4
75
+
76
+ Eval:
77
+ dataset:
78
+ name: LMDBDataSet
79
+ data_dir: ../evaluation
80
+ transforms:
81
+ - DecodeImagePIL: # load image
82
+ img_mode: RGB
83
+ - ARLabelEncode: # Class handling label
84
+ - RecTVResize:
85
+ image_shape: [64, 256]
86
+ padding: False
87
+ - KeepKeys:
88
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
89
+ loader:
90
+ shuffle: False
91
+ drop_last: False
92
+ batch_size_per_card: 256
93
+ num_workers: 2
configs/rec/aster/svtrv2_aster.yml ADDED
@@ -0,0 +1,127 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_aster
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_aster.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.00065 # for 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+
27
+ LRScheduler:
28
+ name: OneCycleLR
29
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
30
+ cycle_momentum: False
31
+
32
+ Architecture:
33
+ model_type: rec
34
+ algorithm: aster
35
+ Transform:
36
+ Encoder:
37
+ name: SVTRv2LNConvTwo33
38
+ use_pos_embed: False
39
+ out_channels: 256
40
+ dims: [128, 256, 384]
41
+ depths: [6, 6, 6]
42
+ num_heads: [4, 8, 12]
43
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
44
+ local_k: [[5, 5], [5, 5], [-1, -1]]
45
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
46
+ last_stage: false
47
+ feat2d: False
48
+ Decoder:
49
+ name: ASTERDecoder
50
+
51
+ Loss:
52
+ name: ARLoss
53
+
54
+ Metric:
55
+ name: RecMetric
56
+ main_indicator: acc
57
+ is_filter: True
58
+
59
+ PostProcess:
60
+ name: ARLabelDecode
61
+
62
+ Train:
63
+ dataset:
64
+ name: RatioDataSetTVResize
65
+ ds_width: True
66
+ padding: false
67
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
68
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
69
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
70
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
71
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
72
+ ]
73
+ transforms:
74
+ - DecodeImagePIL: # load image
75
+ img_mode: RGB
76
+ - PARSeqAugPIL:
77
+ - ARLabelEncode: # Class handling label
78
+ - KeepKeys:
79
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
80
+ sampler:
81
+ name: RatioSampler
82
+ scales: [[128, 32]] # w, h
83
+ # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
84
+ first_bs: &bs 256
85
+ fix_bs: false
86
+ divided_factor: [4, 16] # w, h
87
+ is_training: True
88
+ loader:
89
+ shuffle: True
90
+ batch_size_per_card: *bs
91
+ drop_last: True
92
+ max_ratio: &max_ratio 4
93
+ num_workers: 4
94
+
95
+ Eval:
96
+ dataset:
97
+ name: RatioDataSetTVResize
98
+ ds_width: True
99
+ padding: False
100
+ data_dir_list: [
101
+ '../evaluation/CUTE80',
102
+ '../evaluation/IC13_857',
103
+ '../evaluation/IC15_1811',
104
+ '../evaluation/IIIT5k',
105
+ '../evaluation/SVT',
106
+ '../evaluation/SVTP',
107
+ ]
108
+ transforms:
109
+ - DecodeImagePIL: # load image
110
+ img_mode: RGB
111
+ - ARLabelEncode: # Class handling label
112
+ - KeepKeys:
113
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
114
+ sampler:
115
+ name: RatioSampler
116
+ scales: [[128, 32]] # w, h
117
+ # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
118
+ first_bs: *bs
119
+ fix_bs: false
120
+ divided_factor: [4, 16] # w, h
121
+ is_training: False
122
+ loader:
123
+ shuffle: False
124
+ drop_last: False
125
+ batch_size_per_card: *bs
126
+ max_ratio: *max_ratio
127
+ num_workers: 4
configs/rec/aster/svtrv2_aster_tps_on.yml ADDED
@@ -0,0 +1,102 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_aster_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_aster_tps_on.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.00065 # for 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+
27
+ LRScheduler:
28
+ name: OneCycleLR
29
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
30
+ cycle_momentum: False
31
+
32
+ Architecture:
33
+ model_type: rec
34
+ algorithm: aster
35
+ Transform:
36
+ name: Aster_TPS
37
+ tps_inputsize: [32, 64]
38
+ tps_outputsize: [32, 128]
39
+ Encoder:
40
+ name: SVTRv2LNConvTwo33
41
+ use_pos_embed: False
42
+ out_channels: 256
43
+ dims: [128, 256, 384]
44
+ depths: [6, 6, 6]
45
+ num_heads: [4, 8, 12]
46
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
47
+ local_k: [[5, 5], [5, 5], [-1, -1]]
48
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
49
+ last_stage: false
50
+ feat2d: False
51
+ Decoder:
52
+ name: ASTERDecoder
53
+
54
+ Loss:
55
+ name: ARLoss
56
+
57
+ Metric:
58
+ name: RecMetric
59
+ main_indicator: acc
60
+ is_filter: True
61
+
62
+ PostProcess:
63
+ name: ARLabelDecode
64
+
65
+ Train:
66
+ dataset:
67
+ name: LMDBDataSet
68
+ data_dir: ../Union14M-L-LMDB-Filtered
69
+ transforms:
70
+ - DecodeImagePIL: # load image
71
+ img_mode: RGB
72
+ - PARSeqAugPIL:
73
+ - ARLabelEncode: # Class handling label
74
+ - RecTVResize:
75
+ image_shape: [64, 256]
76
+ padding: False
77
+ - KeepKeys:
78
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
79
+ loader:
80
+ shuffle: True
81
+ batch_size_per_card: 256
82
+ drop_last: True
83
+ num_workers: 4
84
+
85
+ Eval:
86
+ dataset:
87
+ name: LMDBDataSet
88
+ data_dir: ../evaluation
89
+ transforms:
90
+ - DecodeImagePIL: # load image
91
+ img_mode: RGB
92
+ - ARLabelEncode: # Class handling label
93
+ - RecTVResize:
94
+ image_shape: [64, 256]
95
+ padding: False
96
+ - KeepKeys:
97
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
98
+ loader:
99
+ shuffle: False
100
+ drop_last: False
101
+ batch_size_per_card: 256
102
+ num_workers: 2
configs/rec/autostr/autostr_lstm_aster_tps_on.yml ADDED
@@ -0,0 +1,95 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/autostr_lstm_aster_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_autostr_lstm_aster_tps_on.txt
19
+ use_amp: True
20
+ grad_clip_val: 1.0
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.002 # for 4gpus bs256/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: autostr
36
+ Transform:
37
+ name: Aster_TPS
38
+ tps_inputsize: [32, 64]
39
+ tps_outputsize: [32, 128]
40
+ Encoder:
41
+ name: AutoSTREncoder
42
+ stride_stages: '[(2, 2), (2, 1), (2, 2), (2, 1), (2, 1)]'
43
+ conv_op_ids: [2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 4, 1, 1, 6, 6]
44
+ Decoder:
45
+ name: ASTERDecoder
46
+
47
+ Loss:
48
+ name: ARLoss
49
+
50
+ Metric:
51
+ name: RecMetric
52
+ main_indicator: acc
53
+ is_filter: True
54
+
55
+ PostProcess:
56
+ name: ARLabelDecode
57
+
58
+ Train:
59
+ dataset:
60
+ name: LMDBDataSet
61
+ data_dir: ../Union14M-L-LMDB-Filtered
62
+ transforms:
63
+ - DecodeImagePIL: # load image
64
+ img_mode: RGB
65
+ - PARSeqAugPIL:
66
+ - ARLabelEncode: # Class handling label
67
+ - RecTVResize:
68
+ image_shape: [64, 256]
69
+ padding: False
70
+ - KeepKeys:
71
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
72
+ loader:
73
+ shuffle: True
74
+ batch_size_per_card: 256
75
+ drop_last: True
76
+ num_workers: 4
77
+
78
+ Eval:
79
+ dataset:
80
+ name: LMDBDataSet
81
+ data_dir: ../evaluation
82
+ transforms:
83
+ - DecodeImagePIL: # load image
84
+ img_mode: RGB
85
+ - ARLabelEncode: # Class handling label
86
+ - RecTVResize:
87
+ image_shape: [64, 256]
88
+ padding: False
89
+ - KeepKeys:
90
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
91
+ loader:
92
+ shuffle: False
93
+ drop_last: False
94
+ batch_size_per_card: 256
95
+ num_workers: 2
configs/rec/busnet/svtrv2_busnet.yml ADDED
@@ -0,0 +1,135 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 10
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_busnet/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ # ./output/rec/u14m_filter/svtrv2_busnet_pretraining/best.pth
12
+ checkpoints:
13
+ use_tensorboard: false
14
+ infer_img:
15
+ # for data or label process
16
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
17
+ max_text_length: 25
18
+ use_space_char: False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_busnet.txt
20
+ use_amp: True
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065 # 4gpus bs256/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1 # pct_start 0.1*10 = 1ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: BUSBet
36
+ Transform:
37
+ Encoder:
38
+ name: SVTRv2LNConvTwo33
39
+ use_pos_embed: False
40
+ dims: [128, 256, 384]
41
+ depths: [6, 6, 6]
42
+ num_heads: [4, 8, 12]
43
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
44
+ local_k: [[5, 5], [5, 5], [-1, -1]]
45
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
46
+ last_stage: false
47
+ feat2d: False
48
+ Decoder:
49
+ name: BUSDecoder
50
+ nhead: 6
51
+ num_layers: 6
52
+ dim_feedforward: 1536
53
+ ignore_index: &ignore_index 100
54
+ pretraining: False
55
+ # return_id: 2
56
+ Loss:
57
+ name: ABINetLoss
58
+ ignore_index: *ignore_index
59
+
60
+ PostProcess:
61
+ name: ABINetLabelDecode
62
+
63
+ Metric:
64
+ name: RecMetric
65
+ main_indicator: acc
66
+ is_filter: True
67
+
68
+ Train:
69
+ dataset:
70
+ name: RatioDataSetTVResize
71
+ ds_width: True
72
+ padding: false
73
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
75
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
76
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
77
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
78
+ ]
79
+ transforms:
80
+ - DecodeImagePIL: # load image
81
+ img_mode: RGB
82
+ - PARSeqAugPIL:
83
+ - ABINetLabelEncode:
84
+ ignore_index: *ignore_index
85
+ - KeepKeys:
86
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
87
+ sampler:
88
+ name: RatioSampler
89
+ scales: [[128, 32]] # w, h
90
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
91
+ first_bs: &bs 256
92
+ fix_bs: false
93
+ divided_factor: [4, 16] # w, h
94
+ is_training: True
95
+ loader:
96
+ shuffle: True
97
+ batch_size_per_card: *bs
98
+ drop_last: True
99
+ max_ratio: &max_ratio 4
100
+ num_workers: 4
101
+
102
+ Eval:
103
+ dataset:
104
+ name: RatioDataSetTVResize
105
+ ds_width: True
106
+ padding: False
107
+ data_dir_list: [
108
+ '../evaluation/CUTE80',
109
+ '../evaluation/IC13_857',
110
+ '../evaluation/IC15_1811',
111
+ '../evaluation/IIIT5k',
112
+ '../evaluation/SVT',
113
+ '../evaluation/SVTP',
114
+ ]
115
+ transforms:
116
+ - DecodeImagePIL: # load image
117
+ img_mode: RGB
118
+ - ABINetLabelEncode:
119
+ ignore_index: *ignore_index
120
+ - KeepKeys:
121
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
122
+ sampler:
123
+ name: RatioSampler
124
+ scales: [[128, 32]] # w, h
125
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
126
+ first_bs: *bs
127
+ fix_bs: false
128
+ divided_factor: [4, 16] # w, h
129
+ is_training: False
130
+ loader:
131
+ shuffle: False
132
+ drop_last: False
133
+ batch_size_per_card: *bs
134
+ max_ratio: *max_ratio
135
+ num_workers: 4
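
The `RatioDataSetTVResize` + `RatioSampler` blocks above batch samples of similar aspect ratio together (capped at `max_ratio`) so each batch shares one target width at height 32. A rough, hypothetical sketch of that bucketing idea follows; it is not the repository's `RatioSampler`, and the helper name and details are assumptions for illustration only.

```python
# Hypothetical ratio-bucketing sketch: group indices whose aspect ratios fall
# into the same integer bucket, capped at max_ratio (cf. max_ratio: 4 above).
import random
from collections import defaultdict

def ratio_batches(sizes, batch_size=256, max_ratio=4, shuffle=True):
    buckets = defaultdict(list)
    for idx, (w, h) in enumerate(sizes):
        buckets[min(max(round(w / h), 1), max_ratio)].append(idx)
    batches = []
    for idxs in buckets.values():
        if shuffle:
            random.shuffle(idxs)
        batches += [idxs[i:i + batch_size] for i in range(0, len(idxs), batch_size)]
    if shuffle:
        random.shuffle(batches)
    return batches

# e.g. ratio_batches([(100, 32), (400, 32), (130, 32)], batch_size=2)
# puts index 0 in bucket 3 and indices 1, 2 together in bucket 4.
```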
configs/rec/busnet/svtrv2_busnet_pretraining.yml ADDED
@@ -0,0 +1,134 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 10
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_busnet_pretraining/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_busnet_pretraining.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.00065 # 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+
27
+ LRScheduler:
28
+ name: OneCycleLR
29
+ warmup_epoch: 1 # pct_start 0.1*10 = 1ep
30
+ cycle_momentum: False
31
+
32
+ Architecture:
33
+ model_type: rec
34
+ algorithm: BUSBet
35
+ Transform:
36
+ Encoder:
37
+ name: SVTRv2LNConvTwo33
38
+ use_pos_embed: False
39
+ dims: [128, 256, 384]
40
+ depths: [6, 6, 6]
41
+ num_heads: [4, 8, 12]
42
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
43
+ local_k: [[5, 5], [5, 5], [-1, -1]]
44
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
45
+ last_stage: false
46
+ feat2d: False
47
+ Decoder:
48
+ name: BUSDecoder
49
+ nhead: 6
50
+ num_layers: 6
51
+ dim_feedforward: 1536
52
+ ignore_index: &ignore_index 100
53
+ pretraining: True
54
+ # return_id: 0
55
+ Loss:
56
+ name: ABINetLoss
57
+ ignore_index: *ignore_index
58
+
59
+ PostProcess:
60
+ name: ABINetLabelDecode
61
+
62
+ Metric:
63
+ name: RecMetric
64
+ main_indicator: acc
65
+ is_filter: True
66
+
67
+ Train:
68
+ dataset:
69
+ name: RatioDataSetTVResize
70
+ ds_width: True
71
+ padding: false
72
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
73
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
75
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
76
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
77
+ ]
78
+ transforms:
79
+ - DecodeImagePIL: # load image
80
+ img_mode: RGB
81
+ - PARSeqAugPIL:
82
+ - ABINetLabelEncode:
83
+ ignore_index: *ignore_index
84
+ - KeepKeys:
85
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
86
+ sampler:
87
+ name: RatioSampler
88
+ scales: [[128, 32]] # w, h
89
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
90
+ first_bs: &bs 256
91
+ fix_bs: false
92
+ divided_factor: [4, 16] # w, h
93
+ is_training: True
94
+ loader:
95
+ shuffle: True
96
+ batch_size_per_card: *bs
97
+ drop_last: True
98
+ max_ratio: &max_ratio 4
99
+ num_workers: 4
100
+
101
+ Eval:
102
+ dataset:
103
+ name: RatioDataSetTVResize
104
+ ds_width: True
105
+ padding: False
106
+ data_dir_list: [
107
+ '../evaluation/CUTE80',
108
+ '../evaluation/IC13_857',
109
+ '../evaluation/IC15_1811',
110
+ '../evaluation/IIIT5k',
111
+ '../evaluation/SVT',
112
+ '../evaluation/SVTP',
113
+ ]
114
+ transforms:
115
+ - DecodeImagePIL: # load image
116
+ img_mode: RGB
117
+ - ABINetLabelEncode:
118
+ ignore_index: *ignore_index
119
+ - KeepKeys:
120
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
121
+ sampler:
122
+ name: RatioSampler
123
+ scales: [[128, 32]] # w, h
124
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
125
+ first_bs: *bs
126
+ fix_bs: false
127
+ divided_factor: [4, 16] # w, h
128
+ is_training: False
129
+ loader:
130
+ shuffle: False
131
+ drop_last: False
132
+ batch_size_per_card: *bs
133
+ max_ratio: *max_ratio
134
+ num_workers: 4
configs/rec/busnet/vit_busnet.yml ADDED
@@ -0,0 +1,104 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 10
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/vit_busnet/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_vit_busnet.txt
19
+ grad_clip_val: 20
20
+ use_amp: True
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.00053 # 4gpus bs256/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: MultiStepLR
30
+ milestones: [6]
31
+ gamma: 0.1
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: BUSBet
36
+ Transform:
37
+ Encoder:
38
+ name: ViT
39
+ img_size: [32,128]
40
+ patch_size: [4, 8]
41
+ embed_dim: 384
42
+ depth: 12
43
+ num_heads: 6
44
+ mlp_ratio: 4
45
+ qkv_bias: True
46
+ Decoder:
47
+ name: BUSDecoder
48
+ nhead: 6
49
+ num_layers: 6
50
+ dim_feedforward: 1536
51
+ ignore_index: &ignore_index 100
52
+ pretraining: False
53
+ Loss:
54
+ name: ABINetLoss
55
+ ignore_index: *ignore_index
56
+
57
+ PostProcess:
58
+ name: ABINetLabelDecode
59
+
60
+ Metric:
61
+ name: RecMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: LMDBDataSet
68
+ data_dir: ../Union14M-L-LMDB-Filtered
69
+ transforms:
70
+ - DecodeImagePIL: # load image
71
+ img_mode: RGB
72
+ - PARSeqAugPIL:
73
+ - ABINetLabelEncode:
74
+ ignore_index: *ignore_index
75
+ - RecTVResize:
76
+ image_shape: [32, 128]
77
+ padding: False
78
+ - KeepKeys:
79
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
80
+ loader:
81
+ shuffle: True
82
+ batch_size_per_card: 256
83
+ drop_last: True
84
+ num_workers: 4
85
+
86
+ Eval:
87
+ dataset:
88
+ name: LMDBDataSet
89
+ data_dir: ../evaluation
90
+ transforms:
91
+ - DecodeImagePIL: # load image
92
+ img_mode: RGB
93
+ - ABINetLabelEncode:
94
+ ignore_index: *ignore_index
95
+ - RecTVResize:
96
+ image_shape: [32, 128]
97
+ padding: False
98
+ - KeepKeys:
99
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
100
+ loader:
101
+ shuffle: False
102
+ drop_last: False
103
+ batch_size_per_card: 256
104
+ num_workers: 2
configs/rec/busnet/vit_busnet_pretraining.yml ADDED
@@ -0,0 +1,104 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 10
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/vit_busnet_pretraining/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_vit_busnet_pretraining.txt
19
+ grad_clip_val: 20
20
+ use_amp: True
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.00053 # 4gpus bs256/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: MultiStepLR
30
+ milestones: [6]
31
+ gamma: 0.1
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: BUSBet
36
+ Transform:
37
+ Encoder:
38
+ name: ViT
39
+ img_size: [32,128]
40
+ patch_size: [4, 8]
41
+ embed_dim: 384
42
+ depth: 12
43
+ num_heads: 6
44
+ mlp_ratio: 4
45
+ qkv_bias: True
46
+ Decoder:
47
+ name: BUSDecoder
48
+ nhead: 6
49
+ num_layers: 6
50
+ dim_feedforward: 1536
51
+ ignore_index: &ignore_index 100
52
+ pretraining: True
53
+ Loss:
54
+ name: ABINetLoss
55
+ ignore_index: *ignore_index
56
+
57
+ PostProcess:
58
+ name: ABINetLabelDecode
59
+
60
+ Metric:
61
+ name: RecMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: LMDBDataSet
68
+ data_dir: ../Union14M-L-LMDB-Filtered
69
+ transforms:
70
+ - DecodeImagePIL: # load image
71
+ img_mode: RGB
72
+ - PARSeqAugPIL:
73
+ - ABINetLabelEncode:
74
+ ignore_index: *ignore_index
75
+ - RecTVResize:
76
+ image_shape: [32, 128]
77
+ padding: False
78
+ - KeepKeys:
79
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
80
+ loader:
81
+ shuffle: True
82
+ batch_size_per_card: 256
83
+ drop_last: True
84
+ num_workers: 4
85
+
86
+ Eval:
87
+ dataset:
88
+ name: LMDBDataSet
89
+ data_dir: ../evaluation
90
+ transforms:
91
+ - DecodeImagePIL: # load image
92
+ img_mode: RGB
93
+ - ABINetLabelEncode:
94
+ ignore_index: *ignore_index
95
+ - RecTVResize:
96
+ image_shape: [32, 128]
97
+ padding: False
98
+ - KeepKeys:
99
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
100
+ loader:
101
+ shuffle: False
102
+ drop_last: False
103
+ batch_size_per_card: 256
104
+ num_workers: 2
configs/rec/cam/convnextv2_cam_tps_on.yml ADDED
@@ -0,0 +1,118 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/convnextv2_cam_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: False
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_convnextv2_cam_tps_on.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.0008 # for 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+ eps: 1.e-8
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: CAM
36
+ Transform:
37
+ name: Aster_TPS
38
+ tps_inputsize: [32, 64]
39
+ tps_outputsize: &img_shape [32, 128]
40
+ Encoder:
41
+ name: CAMEncoder
42
+ encoder_config:
43
+ name: ConvNeXtV2
44
+ depths: [2, 2, 8, 2]
45
+ dims: [80, 160, 320, 640]
46
+ strides: [[4,4], [2,1], [2,1], [1,1]]
47
+ drop_path_rate: 0.2
48
+ feat2d: True
49
+ nb_classes: 97
50
+ strides: [[4,4], [2,1], [2,1], [1,1]]
51
+ deform_stride: 2
52
+ stage_idx: 2
53
+ use_depthwise_unet: True
54
+ use_more_unet: False
55
+ binary_loss_type: BanlanceMultiClassCrossEntropyLoss
56
+ mid_size: True
57
+ d_embedding: 384
58
+ Decoder:
59
+ name: CAMDecoder
60
+ num_encoder_layers: -1
61
+ beam_size: 0
62
+ num_decoder_layers: 2
63
+ nhead: 8
64
+ max_len: *max_text_length
65
+
66
+ Loss:
67
+ name: CAMLoss
68
+ loss_weight_binary: 1.5
69
+ label_smoothing: 0.
70
+
71
+ Metric:
72
+ name: RecMetric
73
+ main_indicator: acc
74
+ is_filter: True
75
+
76
+ PostProcess:
77
+ name: ARLabelDecode
78
+
79
+ Train:
80
+ dataset:
81
+ name: LMDBDataSet
82
+ data_dir: ../Union14M-L-LMDB-Filtered
83
+ transforms:
84
+ - DecodeImagePIL: # load image
85
+ img_mode: RGB
86
+ - PARSeqAugPIL:
87
+ - CAMLabelEncode: # Class handling label
88
+ font_path: ./arial.ttf
89
+ image_shape: *img_shape
90
+ - RecTVResize:
91
+ image_shape: [64, 256]
92
+ padding: False
93
+ - KeepKeys:
94
+ keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
95
+ loader:
96
+ shuffle: True
97
+ batch_size_per_card: 256
98
+ drop_last: True
99
+ num_workers: 4
100
+
101
+ Eval:
102
+ dataset:
103
+ name: LMDBDataSet
104
+ data_dir: ../evaluation
105
+ transforms:
106
+ - DecodeImagePIL: # load image
107
+ img_mode: RGB
108
+ - ARLabelEncode: # Class handling label
109
+ - RecTVResize:
110
+ image_shape: [64, 256]
111
+ padding: False
112
+ - KeepKeys:
113
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
114
+ loader:
115
+ shuffle: False
116
+ drop_last: False
117
+ batch_size_per_card: 256
118
+ num_workers: 2
configs/rec/cam/convnextv2_tiny_cam_tps_on.yml ADDED
@@ -0,0 +1,118 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/convnextv2_tiny_cam_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: False
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_convnextv2_cam_tps_on.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.0008 # for 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+ eps: 1.e-8
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: CAM
36
+ Transform:
37
+ name: Aster_TPS
38
+ tps_inputsize: [32, 64]
39
+ tps_outputsize: &img_shape [32, 128]
40
+ Encoder:
41
+ name: CAMEncoder
42
+ encoder_config:
43
+ name: ConvNeXtV2
44
+ depths: [3, 3, 9, 3]
45
+ dims: [96, 192, 384, 768]
46
+ strides: [[4,4], [2,1], [2,1], [1,1]]
47
+ drop_path_rate: 0.2
48
+ feat2d: True
49
+ nb_classes: 97
50
+ strides: [[4,4], [2,1], [2,1], [1,1]]
51
+ deform_stride: 2
52
+ stage_idx: 2
53
+ use_depthwise_unet: True
54
+ use_more_unet: False
55
+ binary_loss_type: BanlanceMultiClassCrossEntropyLoss
56
+ mid_size: False
57
+ d_embedding: 512
58
+ Decoder:
59
+ name: CAMDecoder
60
+ num_encoder_layers: -1
61
+ beam_size: 0
62
+ num_decoder_layers: 2
63
+ nhead: 8
64
+ max_len: *max_text_length
65
+
66
+ Loss:
67
+ name: CAMLoss
68
+ loss_weight_binary: 1.5
69
+ label_smoothing: 0.
70
+
71
+ Metric:
72
+ name: RecMetric
73
+ main_indicator: acc
74
+ is_filter: True
75
+
76
+ PostProcess:
77
+ name: ARLabelDecode
78
+
79
+ Train:
80
+ dataset:
81
+ name: LMDBDataSet
82
+ data_dir: ../Union14M-L-LMDB-Filtered
83
+ transforms:
84
+ - DecodeImagePIL: # load image
85
+ img_mode: RGB
86
+ - PARSeqAugPIL:
87
+ - CAMLabelEncode: # Class handling label
88
+ font_path: ./arial.ttf
89
+ image_shape: *img_shape
90
+ - RecTVResize:
91
+ image_shape: [64, 256]
92
+ padding: False
93
+ - KeepKeys:
94
+ keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
95
+ loader:
96
+ shuffle: True
97
+ batch_size_per_card: 256
98
+ drop_last: True
99
+ num_workers: 4
100
+
101
+ Eval:
102
+ dataset:
103
+ name: LMDBDataSet
104
+ data_dir: ../evaluation
105
+ transforms:
106
+ - DecodeImagePIL: # load image
107
+ img_mode: RGB
108
+ - ARLabelEncode: # Class handling label
109
+ - RecTVResize:
110
+ image_shape: [64, 256]
111
+ padding: False
112
+ - KeepKeys:
113
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
114
+ loader:
115
+ shuffle: False
116
+ drop_last: False
117
+ batch_size_per_card: 256
118
+ num_workers: 2
configs/rec/cam/svtrv2_cam_tps_on.yml ADDED
@@ -0,0 +1,123 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_cam_tps_on
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: False
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cam_tps_on.txt
19
+ use_amp: True
20
+
21
+ Optimizer:
22
+ name: AdamW
23
+ lr: 0.00065 # for 4gpus bs256/gpu
24
+ weight_decay: 0.05
25
+ filter_bias_and_bn: True
26
+
27
+ LRScheduler:
28
+ name: OneCycleLR
29
+ warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
30
+ cycle_momentum: False
31
+
32
+ Architecture:
33
+ model_type: rec
34
+ algorithm: CAM
35
+ Transform:
36
+ name: Aster_TPS
37
+ tps_inputsize: [32, 64]
38
+ tps_outputsize: &img_shape [32, 128]
39
+ Encoder:
40
+ name: CAMEncoder
41
+ encoder_config:
42
+ name: SVTRv2LNConvTwo33
43
+ use_pos_embed: False
44
+ dims: [128, 256, 384]
45
+ depths: [6, 6, 6]
46
+ num_heads: [4, 8, 12]
47
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
48
+ local_k: [[5, 5], [5, 5], [-1, -1]]
49
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
50
+ last_stage: false
51
+ feat2d: True
52
+ nb_classes: 97
53
+ strides: [[4, 4], [1, 1], [2, 1], [1, 1]]
54
+ k_size: [[2, 2], [1, 1], [2, 1], [1, 1]]
55
+ q_size: [4, 32]
56
+ deform_stride: 2
57
+ stage_idx: 2
58
+ use_depthwise_unet: True
59
+ use_more_unet: False
60
+ binary_loss_type: BanlanceMultiClassCrossEntropyLoss
61
+ mid_size: True
62
+ d_embedding: 384
63
+ Decoder:
64
+ name: CAMDecoder
65
+ num_encoder_layers: -1
66
+ beam_size: 0
67
+ num_decoder_layers: 2
68
+ nhead: 8
69
+ max_len: *max_text_length
70
+
71
+ Loss:
72
+ name: CAMLoss
73
+ loss_weight_binary: 1.5
74
+ label_smoothing: 0.
75
+
76
+ Metric:
77
+ name: RecMetric
78
+ main_indicator: acc
79
+ is_filter: True
80
+
81
+ PostProcess:
82
+ name: ARLabelDecode
83
+
84
+ Train:
85
+ dataset:
86
+ name: LMDBDataSet
87
+ data_dir: ../Union14M-L-LMDB-Filtered
88
+ transforms:
89
+ - DecodeImagePIL: # load image
90
+ img_mode: RGB
91
+ - PARSeqAugPIL:
92
+ - CAMLabelEncode: # Class handling label
93
+ font_path: ./arial.ttf
94
+ image_shape: *img_shape
95
+ - RecTVResize:
96
+ image_shape: [64, 256]
97
+ padding: False
98
+ - KeepKeys:
99
+ keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
100
+ loader:
101
+ shuffle: True
102
+ batch_size_per_card: 256
103
+ drop_last: True
104
+ num_workers: 4
105
+
106
+ Eval:
107
+ dataset:
108
+ name: LMDBDataSet
109
+ data_dir: ../evaluation
110
+ transforms:
111
+ - DecodeImagePIL: # load image
112
+ img_mode: RGB
113
+ - ARLabelEncode: # Class handling label
114
+ - RecTVResize:
115
+ image_shape: [64, 256]
116
+ padding: False
117
+ - KeepKeys:
118
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
119
+ loader:
120
+ shuffle: False
121
+ drop_last: False
122
+ batch_size_per_card: 256
123
+ num_workers: 2
configs/rec/cdistnet/resnet45_trans_cdistnet.yml ADDED
@@ -0,0 +1,93 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet45_trans_cdistnet
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_cdistnet.txt
19
+ use_amp: True
20
+ grad_clip_val: 5
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.002 # for 4gpus bs256/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: CDistNet
36
+ Transform:
37
+ Encoder:
38
+ name: ResNet45
39
+ in_channels: 3
40
+ strides: [2, 1, 2, 1, 1]
41
+ Decoder:
42
+ name: CDistNetDecoder
43
+ add_conv: True
44
+
45
+ Loss:
46
+ name: ARLoss
47
+
48
+ PostProcess:
49
+ name: ARLabelDecode
50
+
51
+ Metric:
52
+ name: RecMetric
53
+ main_indicator: acc
54
+ is_filter: True
55
+
56
+ Train:
57
+ dataset:
58
+ name: LMDBDataSet
59
+ data_dir: ../Union14M-L-LMDB-Filtered
60
+ transforms:
61
+ - DecodeImagePIL: # load image
62
+ img_mode: RGB
63
+ - PARSeqAugPIL:
64
+ - ARLabelEncode: # Class handling label
65
+ - RecTVResize:
66
+ image_shape: [32, 128]
67
+ padding: False
68
+ - KeepKeys:
69
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
70
+ loader:
71
+ shuffle: True
72
+ batch_size_per_card: 256
73
+ drop_last: True
74
+ num_workers: 4
75
+
76
+ Eval:
77
+ dataset:
78
+ name: LMDBDataSet
79
+ data_dir: ../evaluation
80
+ transforms:
81
+ - DecodeImagePIL: # load image
82
+ img_mode: RGB
83
+ - ARLabelEncode: # Class handling label
84
+ - RecTVResize:
85
+ image_shape: [32, 128]
86
+ padding: False
87
+ - KeepKeys:
88
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
89
+ loader:
90
+ shuffle: False
91
+ drop_last: False
92
+ batch_size_per_card: 256
93
+ num_workers: 2
configs/rec/cdistnet/svtrv2_cdistnet.yml ADDED
@@ -0,0 +1,139 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_cdistnet/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
16
+ # ./tools/utils/ppocr_keys_v1.txt # ch
17
+ max_text_length: &max_text_length 25
18
+ use_space_char: &use_space_char False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cdistnet.txt
20
+ use_amp: True
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065 #4gpus bs256/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: CDistNet
36
+ in_channels: 3
37
+ Transform:
38
+ Encoder:
39
+ name: SVTRv2LNConvTwo33
40
+ use_pos_embed: False
41
+ out_channels: 256
42
+ dims: [128, 256, 384]
43
+ depths: [6, 6, 6]
44
+ num_heads: [4, 8, 12]
45
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
46
+ local_k: [[5, 5], [5, 5], [-1, -1]]
47
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
48
+ last_stage: false
49
+ feat2d: True
50
+ Decoder:
51
+ name: CDistNetDecoder
52
+ add_conv: False
53
+ num_encoder_blocks: 0
54
+
55
+ Loss:
56
+ name: ARLoss
57
+
58
+ PostProcess:
59
+ name: ARLabelDecode
60
+ character_dict_path: *character_dict_path
61
+ use_space_char: *use_space_char
62
+
63
+ Metric:
64
+ name: RecMetric
65
+ main_indicator: acc
66
+ is_filter: True
67
+
68
+ Train:
69
+ dataset:
70
+ name: RatioDataSetTVResize
71
+ ds_width: True
72
+ padding: false
73
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
75
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
76
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
77
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
78
+ ]
79
+ transforms:
80
+ - DecodeImagePIL: # load image
81
+ img_mode: RGB
82
+ - PARSeqAugPIL:
83
+ - ARLabelEncode: # Class handling label
84
+ character_dict_path: *character_dict_path
85
+ use_space_char: *use_space_char
86
+ max_text_length: *max_text_length
87
+ - KeepKeys:
88
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
89
+ sampler:
90
+ name: RatioSampler
91
+ scales: [[128, 32]] # w, h
92
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
93
+ first_bs: &bs 256
94
+ fix_bs: false
95
+ divided_factor: [4, 16] # w, h
96
+ is_training: True
97
+ loader:
98
+ shuffle: True
99
+ batch_size_per_card: *bs
100
+ drop_last: True
101
+ max_ratio: &max_ratio 4
102
+ num_workers: 4
103
+
104
+ Eval:
105
+ dataset:
106
+ name: RatioDataSetTVResize
107
+ ds_width: True
108
+ padding: False
109
+ data_dir_list: [
110
+ '../evaluation/CUTE80',
111
+ '../evaluation/IC13_857',
112
+ '../evaluation/IC15_1811',
113
+ '../evaluation/IIIT5k',
114
+ '../evaluation/SVT',
115
+ '../evaluation/SVTP',
116
+ ]
117
+ transforms:
118
+ - DecodeImagePIL: # load image
119
+ img_mode: RGB
120
+ - ARLabelEncode: # Class handling label
121
+ character_dict_path: *character_dict_path
122
+ use_space_char: *use_space_char
123
+ max_text_length: *max_text_length
124
+ - KeepKeys:
125
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
126
+ sampler:
127
+ name: RatioSampler
128
+ scales: [[128, 32]] # w, h
129
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
130
+ first_bs: *bs
131
+ fix_bs: false
132
+ divided_factor: [4, 16] # w, h
133
+ is_training: False
134
+ loader:
135
+ shuffle: False
136
+ drop_last: False
137
+ batch_size_per_card: *bs
138
+ max_ratio: *max_ratio
139
+ num_workers: 4
configs/rec/cppd/svtr_base_cppd.yml ADDED
@@ -0,0 +1,123 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtr_base_cppd/
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path
18
+ # ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_cppd.txt
23
+ use_amp: True
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.00065 # for 4gpus bs256/gpu
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
34
+ cycle_momentum: False
35
+
36
+ Architecture:
37
+ model_type: rec
38
+ algorithm: CPPD
39
+ in_channels: 3
40
+ Transform:
41
+ Encoder:
42
+ name: SVTRNet
43
+ img_size: [32, 128]
44
+ out_char_num: 25
45
+ out_channels: 256
46
+ patch_merging: 'Conv'
47
+ embed_dim: [128, 256, 384]
48
+ depth: [6, 6, 6]
49
+ num_heads: [4, 8, 12]
50
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
51
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
52
+ last_stage: False
53
+ prenorm: True
54
+ Decoder:
55
+ name: CPPDDecoder
56
+ vis_seq: 64
57
+ num_layer: 2
58
+ pos_len: False
59
+ rec_layer: 1
60
+
61
+
62
+ Loss:
63
+ name: CPPDLoss
64
+ ignore_index: 100
65
+ smoothing: True
66
+ pos_len: False
67
+ sideloss_weight: 1.0
68
+
69
+ PostProcess:
70
+ name: CPPDLabelDecode
71
+ character_dict_path: *character_dict_path
72
+ use_space_char: *use_space_char
73
+
74
+ Metric:
75
+ name: RecMetric
76
+ main_indicator: acc
77
+
78
+ Train:
79
+ dataset:
80
+ name: LMDBDataSet
81
+ data_dir: ../Union14M-L-LMDB-Filtered
82
+ transforms:
83
+ - DecodeImagePIL: # load image
84
+ img_mode: RGB
85
+ - PARSeqAugPIL:
86
+ - CPPDLabelEncode: # Class handling label
87
+ pos_len: False
88
+ character_dict_path: *character_dict_path
89
+ use_space_char: *use_space_char
90
+ max_text_length: *max_text_length
91
+ - RecTVResize:
92
+ image_shape: [32, 128]
93
+ padding: False
94
+ - KeepKeys:
95
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
96
+ loader:
97
+ shuffle: True
98
+ batch_size_per_card: 256
99
+ drop_last: True
100
+ num_workers: 4
101
+
102
+ Eval:
103
+ dataset:
104
+ name: LMDBDataSet
105
+ data_dir: ../evaluation/
106
+ transforms:
107
+ - DecodeImagePIL: # load image
108
+ img_mode: RGB
109
+ - CPPDLabelEncode: # Class handling label
110
+ pos_len: False
111
+ character_dict_path: *character_dict_path
112
+ use_space_char: *use_space_char
113
+ max_text_length: *max_text_length
114
+ - RecTVResize:
115
+ image_shape: [32, 128]
116
+ padding: False
117
+ - KeepKeys:
118
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
119
+ loader:
120
+ shuffle: False
121
+ drop_last: False
122
+ batch_size_per_card: 128
123
+ num_workers: 4
configs/rec/cppd/svtr_base_cppd_ch.yml ADDED
@@ -0,0 +1,126 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 100
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/ch/svtr_base_cppd/
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 2000]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: False
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/ppocr_keys_v1.txt
18
+ # ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/ch/predicts_svtr_base_cppd.txt
23
+ use_amp: True
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.0005 # for 4gpus bs128/gpu
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: CosineAnnealingLR
33
+ warmup_epoch: 5
34
+
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: CPPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: SVTRNet
42
+ img_size: [32, 256]
43
+ patch_merging: 'Conv'
44
+ embed_dim: [128, 256, 384]
45
+ depth: [6, 6, 4]
46
+ num_heads: [4, 8, 12]
47
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
48
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
49
+ last_stage: False
50
+ prenorm: True
51
+ Decoder:
52
+ name: CPPDDecoder
53
+ vis_seq: 128
54
+ num_layer: 3
55
+ pos_len: False
56
+ rec_layer: 1
57
+ ch: True
58
+
59
+
60
+ Loss:
61
+ name: CPPDLoss
62
+ ignore_index: 7000
63
+ smoothing: True
64
+ pos_len: False
65
+ sideloss_weight: 1.0
66
+
67
+ PostProcess:
68
+ name: CPPDLabelDecode
69
+ character_dict_path: *character_dict_path
70
+ use_space_char: *use_space_char
71
+
72
+ Metric:
73
+ name: RecMetric
74
+ main_indicator: acc
75
+
76
+ Train:
77
+ dataset:
78
+ name: LMDBDataSet
79
+ data_dir: ../benchmark_bctr/benchmark_bctr_train
80
+ transforms:
81
+ - DecodeImage: # load image
82
+ img_mode: BGR
83
+ channel_first: False
84
+ - CPPDLabelEncode: # Class handling label
85
+ pos_len: False
86
+ ch: True
87
+ ignore_index: 7000
88
+ character_dict_path: *character_dict_path
89
+ use_space_char: *use_space_char
90
+ max_text_length: *max_text_length
91
+ - SVTRResize:
92
+ image_shape: [3, 32, 256]
93
+ padding: True
94
+ - KeepKeys:
95
+ keep_keys: ['image', 'label', 'label_node', 'label_index', 'length'] # dataloader will return list in this order
96
+ loader:
97
+ shuffle: True
98
+ batch_size_per_card: 128
99
+ drop_last: True
100
+ num_workers: 8
101
+
102
+ Eval:
103
+ dataset:
104
+ name: LMDBDataSet
105
+ data_dir: ../benchmark_bctr/benchmark_bctr_test/scene_test
106
+ transforms:
107
+ - DecodeImage: # load image
108
+ img_mode: BGR
109
+ channel_first: False
110
+ - CPPDLabelEncode: # Class handling label
111
+ pos_len: False
112
+ ch: True
113
+ ignore_index: 7000
114
+ character_dict_path: *character_dict_path
115
+ use_space_char: *use_space_char
116
+ max_text_length: *max_text_length
117
+ - SVTRResize:
118
+ image_shape: [3, 32, 256]
119
+ padding: True
120
+ - KeepKeys:
121
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
122
+ loader:
123
+ shuffle: False
124
+ drop_last: False
125
+ batch_size_per_card: 256
126
+ num_workers: 4
configs/rec/cppd/svtr_base_cppd_h8.yml ADDED
@@ -0,0 +1,123 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtr_base_h8_cppd/
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_cppd.txt
22
+ use_amp: True
23
+
24
+ Optimizer:
25
+ name: AdamW
26
+ lr: 0.00065 # for 4gpus bs256/gpu
27
+ weight_decay: 0.05
28
+ filter_bias_and_bn: True
29
+
30
+ LRScheduler:
31
+ name: OneCycleLR
32
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
33
+ cycle_momentum: False
34
+
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: CPPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: SVTRNet
42
+ img_size: [32, 128]
43
+ out_char_num: 25
44
+ out_channels: 256
45
+ patch_merging: 'Conv'
46
+ embed_dim: [128, 256, 384]
47
+ depth: [6, 6, 6]
48
+ num_heads: [4, 8, 12]
49
+ sub_k: [[1, 1], [2, 1]]
50
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
51
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
52
+ last_stage: False
53
+ prenorm: True
54
+ Decoder:
55
+ name: CPPDDecoder
56
+ vis_seq: 128
57
+ num_layer: 2
58
+ pos_len: False
59
+ rec_layer: 1
60
+
61
+ Loss:
62
+ name: CPPDLoss
63
+ ignore_index: 100
64
+ smoothing: True
65
+ pos_len: False
66
+ sideloss_weight: 1.0
67
+
68
+ PostProcess:
69
+ name: CPPDLabelDecode
70
+ character_dict_path: *character_dict_path
71
+ use_space_char: *use_space_char
72
+
73
+ Metric:
74
+ name: RecMetric
75
+ main_indicator: acc
76
+ is_filter: True
77
+
78
+ Train:
79
+ dataset:
80
+ name: LMDBDataSet
81
+ data_dir: ../Union14M-L-LMDB-Filtered
82
+ transforms:
83
+ - DecodeImagePIL: # load image
84
+ img_mode: RGB
85
+ - PARSeqAugPIL:
86
+ - CPPDLabelEncode: # Class handling label
87
+ pos_len: False
88
+ character_dict_path: *character_dict_path
89
+ use_space_char: *use_space_char
90
+ max_text_length: *max_text_length
91
+ - RecTVResize:
92
+ image_shape: [32, 128]
93
+ padding: False
94
+ - KeepKeys:
95
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
96
+ loader:
97
+ shuffle: True
98
+ batch_size_per_card: 256
99
+ drop_last: True
100
+ num_workers: 4
101
+
102
+ Eval:
103
+ dataset:
104
+ name: LMDBDataSet
105
+ data_dir: ../evaluation/
106
+ transforms:
107
+ - DecodeImagePIL: # load image
108
+ img_mode: RGB
109
+ - CPPDLabelEncode: # Class handling label
110
+ pos_len: False
111
+ character_dict_path: *character_dict_path
112
+ use_space_char: *use_space_char
113
+ max_text_length: *max_text_length
114
+ - RecTVResize:
115
+ image_shape: [32, 128]
116
+ padding: False
117
+ - KeepKeys:
118
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
119
+ loader:
120
+ shuffle: False
121
+ drop_last: False
122
+ batch_size_per_card: 128
123
+ num_workers: 4
configs/rec/cppd/svtr_base_cppd_syn.yml ADDED
@@ -0,0 +1,124 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 60
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/syn/svtr_base_cppd/
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path
18
+ # ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/syn/predicts_svtr_base_cppd.txt
23
+ use_amp: True
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.0005 # for 4gpus bs256/gpu
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: CosineAnnealingLR
33
+ warmup_epoch: 6
34
+
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: CPPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: SVTRNet
42
+ img_size: [32, 100]
43
+ out_char_num: 25
44
+ out_channels: 256
45
+ patch_merging: 'Conv'
46
+ embed_dim: [128, 256, 384]
47
+ depth: [6, 6, 4]
48
+ num_heads: [4, 8, 12]
49
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
50
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
51
+ last_stage: False
52
+ prenorm: True
53
+ Decoder:
54
+ name: CPPDDecoder
55
+ vis_seq: 50
56
+ num_layer: 3
57
+ pos_len: False
58
+ rec_layer: 1
59
+
60
+
61
+ Loss:
62
+ name: CPPDLoss
63
+ ignore_index: 100
64
+ smoothing: True
65
+ pos_len: False
66
+ sideloss_weight: 1.0
67
+
68
+ PostProcess:
69
+ name: CPPDLabelDecode
70
+ character_dict_path: *character_dict_path
71
+ use_space_char: *use_space_char
72
+
73
+ Metric:
74
+ name: RecMetric
75
+ main_indicator: acc
76
+
77
+ Train:
78
+ dataset:
79
+ name: STRLMDBDataSet
80
+ data_dir: ./
81
+ transforms:
82
+ - DecodeImage: # load image
83
+ img_mode: BGR
84
+ channel_first: False
85
+ # - SVTRRAug:
86
+ - CPPDLabelEncode: # Class handling label
87
+ pos_len: False
88
+ character_dict_path: *character_dict_path
89
+ use_space_char: *use_space_char
90
+ max_text_length: *max_text_length
91
+ - SVTRResize:
92
+ image_shape: [3, 32, 100]
93
+ padding: False
94
+ - KeepKeys:
95
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
96
+ loader:
97
+ shuffle: True
98
+ batch_size_per_card: 256
99
+ drop_last: True
100
+ num_workers: 8
101
+
102
+ Eval:
103
+ dataset:
104
+ name: LMDBDataSet
105
+ data_dir: ../evaluation/
106
+ transforms:
107
+ - DecodeImage: # load image
108
+ img_mode: BGR
109
+ channel_first: False
110
+ - CPPDLabelEncode: # Class handling label
111
+ pos_len: False
112
+ character_dict_path: *character_dict_path
113
+ use_space_char: *use_space_char
114
+ max_text_length: *max_text_length
115
+ - SVTRResize:
116
+ image_shape: [3, 32, 100]
117
+ padding: False
118
+ - KeepKeys:
119
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
120
+ loader:
121
+ shuffle: False
122
+ drop_last: False
123
+ batch_size_per_card: 256
124
+ num_workers: 4
configs/rec/cppd/svtrv2_cppd.yml ADDED
@@ -0,0 +1,150 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_cppd/
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cppd.txt
22
+ use_amp: True
23
+
24
+ Optimizer:
25
+ name: AdamW
26
+ lr: 0.00065 # for 4gpus bs256/gpu
27
+ weight_decay: 0.05
28
+ filter_bias_and_bn: True
29
+
30
+ LRScheduler:
31
+ name: OneCycleLR
32
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
33
+ cycle_momentum: False
34
+
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: CPPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: SVTRv2LNConvTwo33
42
+ use_pos_embed: False
43
+ out_channels: 256
44
+ dims: [128, 256, 384]
45
+ depths: [6, 6, 6]
46
+ num_heads: [4, 8, 12]
47
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
48
+ local_k: [[5, 5], [5, 5], [-1, -1]]
49
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
50
+ last_stage: false
51
+ feat2d: False
52
+ Decoder:
53
+ name: CPPDDecoder
54
+ ds: True
55
+ num_layer: 2
56
+ pos_len: False
57
+ rec_layer: 1
58
+
59
+
60
+ Loss:
61
+ name: CPPDLoss
62
+ ignore_index: 100
63
+ smoothing: True
64
+ pos_len: False
65
+ sideloss_weight: 1.0
66
+
67
+ PostProcess:
68
+ name: CPPDLabelDecode
69
+ character_dict_path: *character_dict_path
70
+ use_space_char: *use_space_char
71
+
72
+ Metric:
73
+ name: RecMetric
74
+ main_indicator: acc
75
+ is_filter: True
76
+
77
+ Train:
78
+ dataset:
79
+ name: RatioDataSetTVResize
80
+ ds_width: True
81
+ padding: false
82
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
83
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
84
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
85
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
86
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
87
+ ]
88
+ transforms:
89
+ - DecodeImagePIL: # load image
90
+ img_mode: RGB
91
+ - PARSeqAugPIL:
92
+ - CPPDLabelEncode: # Class handling label
93
+ pos_len: False
94
+ character_dict_path: *character_dict_path
95
+ use_space_char: *use_space_char
96
+ max_text_length: *max_text_length
97
+ - KeepKeys:
98
+ keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
99
+ sampler:
100
+ name: RatioSampler
101
+ scales: [[128, 32]] # w, h
102
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
103
+ first_bs: &bs 256
104
+ fix_bs: false
105
+ divided_factor: [4, 16] # w, h
106
+ is_training: True
107
+ loader:
108
+ shuffle: True
109
+ batch_size_per_card: *bs
110
+ drop_last: True
111
+ max_ratio: &max_ratio 4
112
+ num_workers: 4
113
+
114
+ Eval:
115
+ dataset:
116
+ name: RatioDataSetTVResize
117
+ ds_width: True
118
+ padding: False
119
+ data_dir_list: [
120
+ '../evaluation/CUTE80',
121
+ '../evaluation/IC13_857',
122
+ '../evaluation/IC15_1811',
123
+ '../evaluation/IIIT5k',
124
+ '../evaluation/SVT',
125
+ '../evaluation/SVTP',
126
+ ]
127
+ transforms:
128
+ - DecodeImagePIL: # load image
129
+ img_mode: RGB
130
+ - CPPDLabelEncode: # Class handling label
131
+ pos_len: False
132
+ character_dict_path: *character_dict_path
133
+ use_space_char: *use_space_char
134
+ max_text_length: *max_text_length
135
+ - KeepKeys:
136
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
137
+ sampler:
138
+ name: RatioSampler
139
+ scales: [[128, 32]] # w, h
140
+ # divide_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
141
+ first_bs: *bs
142
+ fix_bs: false
143
+ divided_factor: [4, 16] # w, h
144
+ is_training: False
145
+ loader:
146
+ shuffle: False
147
+ drop_last: False
148
+ batch_size_per_card: *bs
149
+ max_ratio: *max_ratio
150
+ num_workers: 4
configs/rec/dan/resnet45_fpn_dan.yml ADDED
@@ -0,0 +1,98 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/resnet45_fpn_dan/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_resnet45_fpn_dan.txt
19
+ use_amp: True
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: Adam
24
+ lr: 0.00065 # for 4gpus bs256/gpu
25
+ weight_decay: 0.0
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: DAN
36
+ Transform:
37
+ Encoder:
38
+ name: ResNet45
39
+ in_channels: 3
40
+ strides: [2, 1, 2, 1, 1]
41
+ return_list: True
42
+ Decoder:
43
+ name: DANDecoder
44
+ max_len: 25
45
+ channels_list: [64, 128, 256, 512]
46
+ strides_list: [[2, 2], [1, 1], [1, 1]]
47
+ in_shape: [8, 32]
48
+ depth: 4
49
+
50
+ Loss:
51
+ name: ARLoss
52
+
53
+ PostProcess:
54
+ name: ARLabelDecode
55
+
56
+ Metric:
57
+ name: RecMetric
58
+ main_indicator: acc
59
+ is_filter: True
60
+
61
+ Train:
62
+ dataset:
63
+ name: LMDBDataSet
64
+ data_dir: ../Union14M-L-LMDB-Filtered
65
+ transforms:
66
+ - DecodeImagePIL: # load image
67
+ img_mode: RGB
68
+ - PARSeqAugPIL:
69
+ - ARLabelEncode:
70
+ - RecTVResize:
71
+ image_shape: [32, 128]
72
+ padding: False
73
+ - KeepKeys:
74
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
75
+ loader:
76
+ shuffle: True
77
+ batch_size_per_card: 256
78
+ drop_last: True
79
+ num_workers: 4
80
+
81
+ Eval:
82
+ dataset:
83
+ name: LMDBDataSet
84
+ data_dir: ../evaluation
85
+ transforms:
86
+ - DecodeImagePIL: # load image
87
+ img_mode: RGB
88
+ - ARLabelEncode:
89
+ - RecTVResize:
90
+ image_shape: [32, 128]
91
+ padding: False
92
+ - KeepKeys:
93
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
94
+ loader:
95
+ shuffle: False
96
+ drop_last: False
97
+ batch_size_per_card: 256
98
+ num_workers: 2
configs/rec/dan/svtrv2_dan.yml ADDED
@@ -0,0 +1,130 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_dan
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: 25
17
+ use_space_char: False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_dan.txt
19
+ use_amp: True
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065 # 4gpus 256bs/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: DAN
36
+ Transform:
37
+ Encoder:
38
+ name: SVTRv2LNConvTwo33
39
+ use_pos_embed: False
40
+ out_channels: 256
41
+ dims: [128, 256, 384]
42
+ depths: [6, 6, 6]
43
+ num_heads: [4, 8, 12]
44
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
45
+ local_k: [[5, 5], [5, 5], [-1, -1]]
46
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
47
+ last_stage: false
48
+ feat2d: True
49
+ Decoder:
50
+ name: DANDecoder
51
+ use_cam: False
52
+ max_len: 25
53
+
54
+ Loss:
55
+ name: ARLoss
56
+
57
+ PostProcess:
58
+ name: ARLabelDecode
59
+
60
+ Metric:
61
+ name: RecMetric
62
+ main_indicator: acc
63
+ is_filter: True
64
+
65
+ Train:
66
+ dataset:
67
+ name: RatioDataSetTVResize
68
+ ds_width: True
69
+ padding: false
70
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
71
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
72
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
73
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_normal',
74
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_easy',
75
+ ]
76
+ transforms:
77
+ - DecodeImagePIL: # load image
78
+ img_mode: RGB
79
+ - PARSeqAugPIL:
80
+ - ARLabelEncode:
81
+ - KeepKeys:
82
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
83
+ sampler:
84
+ name: RatioSampler
85
+ scales: [[128, 32]] # w, h
86
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
87
+ first_bs: &bs 256
88
+ fix_bs: false
89
+ divided_factor: [4, 16] # w, h
90
+ is_training: True
91
+ loader:
92
+ shuffle: True
93
+ batch_size_per_card: *bs
94
+ drop_last: True
95
+ max_ratio: &max_ratio 4
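+      # Assumption: max_ratio caps the width/height aspect-ratio bucket used by RatioSampler, i.e. with
+      # the [128, 32] scale, batches are formed at widths up to roughly 4 * 32 = 128 px.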
96
+ num_workers: 4
97
+
98
+ Eval:
99
+ dataset:
100
+ name: RatioDataSetTVResize
101
+ ds_width: True
102
+ padding: False
103
+ data_dir_list: [
104
+ '../evaluation/CUTE80',
105
+ '../evaluation/IC13_857',
106
+ '../evaluation/IC15_1811',
107
+ '../evaluation/IIIT5k',
108
+ '../evaluation/SVT',
109
+ '../evaluation/SVTP',
110
+ ]
111
+ transforms:
112
+ - DecodeImagePIL: # load image
113
+ img_mode: RGB
114
+ - ARLabelEncode:
115
+ - KeepKeys:
116
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
117
+ sampler:
118
+ name: RatioSampler
119
+ scales: [[128, 32]] # w, h
120
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
121
+ first_bs: *bs
122
+ fix_bs: false
123
+ divided_factor: [4, 16] # w, h
124
+ is_training: False
125
+ loader:
126
+ shuffle: False
127
+ drop_last: False
128
+ batch_size_per_card: *bs
129
+ max_ratio: *max_ratio
130
+ num_workers: 4
configs/rec/dptr/dptr_parseq_pretrain.yml ADDED
@@ -0,0 +1,88 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: /share/ckpt/zhaoshuai/openocr/dptr_parseq/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: &use_space_char False
18
+ use_amp: True
19
+ save_res_path: /share/ckpt/zhaoshuai/openocr/dptr_parseq/predicts_dptr_parseq.txt
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.001485 # 2gpus 384bs/gpu
25
+ weight_decay: 0.
26
+ filter_bias_and_bn: False
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: DPTR
36
+ Decoder:
37
+ name: DptrParseq
38
+ decode_ar: True
39
+ refine_iters: 1
40
+ is_pretrain: True
41
+ ORP_path: /share/ckpt/zhaoshuai/parseq/clip_background.pth
42
+
43
+ Loss:
44
+ name: PARSeqLoss
45
+
46
+ PostProcess:
47
+ name: ARLabelDecode
48
+ character_dict_path: *character_dict_path
49
+ use_space_char: *use_space_char
50
+
51
+ Metric:
52
+ name: RecMetric
53
+ main_indicator: acc
54
+ is_filter: True
55
+
56
+ Train:
57
+ dataset:
58
+ name: TextLMDBDataSet
59
+ data_dir: /share/test/zhaoshuai/parseq-data/data/train/real/ArT
60
+ transforms:
61
+ - DPTRLabelEncode: # Class handling label
62
+ character_dict_path: *character_dict_path
63
+ use_space_char: *use_space_char
64
+ max_text_length: *max_text_length
65
+ - KeepKeys:
66
+ keep_keys: ['clip_label', 'label'] # dataloader will return list in this order
67
+ loader:
68
+ shuffle: True
69
+ batch_size_per_card: 256
70
+ drop_last: True
71
+ num_workers: 4
72
+
73
+ Eval:
74
+ dataset:
75
+ name: TextLMDBDataSet
76
+ data_dir: /share/test/zhaoshuai/parseq-data/data/val
77
+ transforms:
78
+ - DPTRLabelEncode: # Class handling label
79
+ character_dict_path: *character_dict_path
80
+ use_space_char: *use_space_char
81
+ max_text_length: *max_text_length
82
+ - KeepKeys:
83
+ keep_keys: ['clip_label', 'label'] # dataloader will return list in this order
84
+ loader:
85
+ shuffle: False
86
+ drop_last: False
87
+ batch_size_per_card: 256
88
+ num_workers: 2
configs/rec/focalsvtr/focalsvtr_ctc.yml ADDED
@@ -0,0 +1,137 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/focalsvtr_ctc/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path
16
+ # ./tools/utils/EN_symbol_dict.txt
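+  # (assumption) with character_dict_path left empty, the recognizer presumably falls back to its
+  # default lowercase-alphanumeric character set; uncomment the path above to use the 96-char English dict.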
17
+ max_text_length: &max_text_length 25
18
+ use_space_char: &use_space_char False
19
+ save_res_path: ./output/rec/u14m_filter/predicts_focalsvtr_ctc.txt
20
+
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065 # for 4gpus bs256/gpu
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+
31
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
32
+ cycle_momentum: False
33
+
34
+ Architecture:
35
+ model_type: rec
36
+ algorithm: SVTR
37
+ Transform:
38
+ Encoder:
39
+ name: FocalSVTR
40
+ img_size: [32, 128]
41
+ depths: [6, 6, 6]
42
+ embed_dim: 96
43
+ sub_k: [[1, 1], [2, 1], [1, 1]]
44
+ focal_levels: [3, 3, 3]
45
+ out_channels: 256
46
+ last_stage: True
47
+ Decoder:
48
+ name: CTCDecoder
49
+
50
+ Loss:
51
+ name: CTCLoss
52
+ zero_infinity: True
53
+
54
+ PostProcess:
55
+ name: CTCLabelDecode
56
+ character_dict_path: *character_dict_path
57
+ use_space_char: *use_space_char
58
+
59
+ Metric:
60
+ name: RecMetric
61
+ main_indicator: acc
62
+ is_filter: True
63
+
64
+
65
+ Train:
66
+ dataset:
67
+ name: RatioDataSet
68
+ ds_width: True
69
+ padding: &padding False
70
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
71
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
72
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
73
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
75
+ ]
76
+ transforms:
77
+ - DecodeImage: # load image
78
+ img_mode: BGR
79
+ channel_first: False
80
+ - PARSeqAug:
81
+ - CTCLabelEncode: # Class handling label
82
+ character_dict_path: *character_dict_path
83
+ use_space_char: *use_space_char
84
+ max_text_length: *max_text_length
85
+ - KeepKeys:
86
+ keep_keys: ['image', 'label', 'length']
87
+ sampler:
88
+ name: RatioSampler
89
+ scales: [[128, 32]] # w, h
90
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
91
+ first_bs: &bs 256
92
+ fix_bs: false
93
+ divided_factor: [4, 16] # w, h
94
+ is_training: True
95
+ loader:
96
+ shuffle: True
97
+ batch_size_per_card: *bs
98
+ drop_last: True
99
+ max_ratio: 12
100
+ num_workers: 4
101
+
102
+ Eval:
103
+ dataset:
104
+ name: RatioDataSet
105
+ ds_width: True
106
+ padding: True
107
+ data_dir_list: ['../evaluation/CUTE80',
108
+ '../evaluation/IC13_857',
109
+ '../evaluation/IC15_1811',
110
+ '../evaluation/IIIT5k',
111
+ '../evaluation/SVT',
112
+ '../evaluation/SVTP',
113
+ ]
114
+ transforms:
115
+ - DecodeImage: # load image
116
+ img_mode: BGR
117
+ channel_first: False
118
+ - CTCLabelEncode: # Class handling label
119
+ character_dict_path: *character_dict_path
120
+ use_space_char: *use_space_char
121
+ max_text_length: *max_text_length
122
+ - KeepKeys:
123
+ keep_keys: ['image', 'label', 'length']
124
+ sampler:
125
+ name: RatioSampler
126
+ scales: [[128, 32]] # w, h
127
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
128
+ first_bs: 128
129
+ fix_bs: false
130
+ divided_factor: [4, 16] # w, h
131
+ is_training: False
132
+ loader:
133
+ shuffle: False
134
+ drop_last: False
135
+ batch_size_per_card: 128
136
+ max_ratio: 12
137
+ num_workers: 4
configs/rec/gtc/svtrv2_lnconv_nrtr_gtc.yml ADDED
@@ -0,0 +1,168 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/svtrv2_lnconv_nrtr_gtc
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img: ../ltb/img
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/predicts_smtr.txt
22
+ use_amp: True
23
+ distributed: true
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.00065
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
34
+ cycle_momentum: False
35
+
36
+ Architecture:
37
+ model_type: rec
38
+ algorithm: BGPD
39
+ in_channels: 3
40
+ Transform:
41
+ Encoder:
42
+ name: SVTRv2LNConvTwo33
43
+ use_pos_embed: False
44
+ out_channels: 256
45
+ dims: [128, 256, 384]
46
+ depths: [6, 6, 6]
47
+ num_heads: [4, 8, 12]
48
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
49
+ local_k: [[5, 5], [5, 5], [-1, -1]]
50
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
51
+ last_stage: false
52
+ feat2d: True
53
+ Decoder:
54
+ name: GTCDecoder
55
+ infer_gtc: True
56
+ detach: False
57
+ gtc_decoder:
58
+ name: NRTRDecoder
59
+ num_encoder_layers: -1
60
+ beam_size: 0
61
+ num_decoder_layers: 2
62
+ nhead: 12
63
+ max_len: *max_text_length
64
+ ctc_decoder:
65
+ name: RCTCDecoder
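+    # The GTC head pairs an attention branch (gtc_decoder, here NRTR) with a CTC branch (ctc_decoder);
+    # both are supervised through GTCLoss, and infer_gtc selects the attention branch at inference
+    # (compare the *_long_infer config, where infer_gtc is False and only the CTC branch is decoded).
+    # This summary is an assumption read off the config fields; see the GTCDecoder code for details.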
66
+
67
+ Loss:
68
+ name: GTCLoss
69
+ gtc_loss:
70
+ name: ARLoss
71
+
72
+ PostProcess:
73
+ name: GTCLabelDecode
74
+ gtc_label_decode:
75
+ name: ARLabelDecode
76
+ character_dict_path: *character_dict_path
77
+ use_space_char: *use_space_char
78
+
79
+ Metric:
80
+ name: RecGTCMetric
81
+ main_indicator: acc
82
+ is_filter: True
83
+
84
+ Train:
85
+ dataset:
86
+ name: RatioDataSet
87
+ ds_width: True
88
+ # max_ratio: &max_ratio 4
89
+ # min_ratio: 1
90
+ # base_shape: &base_shape [[64, 64], [96, 48], [112, 40], [128, 32]]
91
+ # base_h: &base_h 32
92
+ # padding: &padding False
93
+ padding: false
94
+ # padding_rand: true
95
+ # padding_doub: true
96
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
97
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
98
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
99
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
100
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
101
+ ]
102
+ transforms:
103
+ - DecodeImage: # load image
104
+ img_mode: BGR
105
+ channel_first: False
106
+ - PARSeqAug:
107
+ - GTCLabelEncode: # Class handling label
108
+ gtc_label_encode:
109
+ name: ARLabelEncode
110
+ character_dict_path: *character_dict_path
111
+ use_space_char: *use_space_char
112
+ max_text_length: *max_text_length
113
+ - KeepKeys:
114
+ keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
115
+ sampler:
116
+ name: RatioSampler
117
+ scales: [[128, 32]] # w, h
118
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
119
+ first_bs: &bs 256
120
+ fix_bs: false
121
+ divided_factor: [4, 16] # w, h
122
+ is_training: True
123
+ loader:
124
+ shuffle: True
125
+ batch_size_per_card: *bs
126
+ drop_last: True
127
+ max_ratio: &max_ratio 4
128
+ num_workers: 4
129
+
130
+ Eval:
131
+ dataset:
132
+ name: RatioDataSet
133
+ ds_width: True
134
+ padding: False
135
+ data_dir_list: [
136
+ '../evaluation/CUTE80',
137
+ '../evaluation/IC13_857',
138
+ '../evaluation/IC15_1811',
139
+ '../evaluation/IIIT5k',
140
+ '../evaluation/SVT',
141
+ '../evaluation/SVTP',
142
+ ]
143
+ transforms:
144
+ - DecodeImage: # load image
145
+ img_mode: BGR
146
+ channel_first: False
147
+ - GTCLabelEncode: # Class handling label
148
+ gtc_label_encode:
149
+ name: ARLabelEncode
150
+ character_dict_path: *character_dict_path
151
+ use_space_char: *use_space_char
152
+ max_text_length: *max_text_length
153
+ - KeepKeys:
154
+ keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
155
+ sampler:
156
+ name: RatioSampler
157
+ scales: [[128, 32]] # w, h
158
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
159
+ first_bs: *bs
160
+ fix_bs: false
161
+ divided_factor: [4, 16] # w, h
162
+ is_training: False
163
+ loader:
164
+ shuffle: False
165
+ drop_last: False
166
+ batch_size_per_card: *bs
167
+ max_ratio: *max_ratio
168
+ num_workers: 4
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_long_infer.yml ADDED
@@ -0,0 +1,151 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_long_infer
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 1000]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img: ../ltb/img
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/predicts_smtr.txt
22
+ use_amp: True
23
+ distributed: true
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.000325
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
34
+ cycle_momentum: False
35
+
36
+ Architecture:
37
+ model_type: rec
38
+ algorithm: BGPD
39
+ in_channels: 3
40
+ Transform:
41
+ Encoder:
42
+ name: SVTRv2LNConvTwo33
43
+ use_pos_embed: False
44
+ out_channels: 256
45
+ dims: [128, 256, 384]
46
+ depths: [6, 6, 6]
47
+ num_heads: [4, 8, 12]
48
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
49
+ local_k: [[5, 5], [5, 5], [-1, -1]]
50
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
51
+ last_stage: false
52
+ feat2d: True
53
+ Decoder:
54
+ name: GTCDecoder
55
+ infer_gtc: False
56
+ detach: False
57
+ gtc_decoder:
58
+ name: SMTRDecoder
59
+ num_layer: 1
60
+ ds: True
61
+ max_len: *max_text_length
62
+ next_mode: &next True
63
+ sub_str_len: &subsl 5
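+      # Assumption: SMTR decodes by repeatedly predicting the character adjacent to a sub-string of
+      # sub_str_len characters (direction controlled by next_mode), which is what allows it to handle
+      # text longer than max_text_length at inference.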
64
+ ctc_decoder:
65
+ name: RCTCDecoder
66
+
67
+ Loss:
68
+ name: CTCLoss
69
+
70
+ PostProcess:
71
+ name: CTCLabelDecode
72
+ character_dict_path: *character_dict_path
73
+ use_space_char: *use_space_char
74
+
75
+ Metric:
76
+ name: RecMetric
77
+ main_indicator: acc
78
+ is_filter: True
79
+
80
+ Train:
81
+ dataset:
82
+ name: RatioDataSetTVResize
83
+ ds_width: True
84
+ padding: false
85
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
86
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
87
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
88
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
89
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
90
+ ]
91
+ transforms:
92
+ - DecodeImagePIL: # load image
93
+ img_mode: RGB
94
+ - PARSeqAugPIL:
95
+ - CTCLabelEncode: # Class handling label
96
+ character_dict_path: *character_dict_path
97
+ use_space_char: *use_space_char
98
+ max_text_length: *max_text_length
99
+ - KeepKeys:
100
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
101
+ sampler:
102
+ name: RatioSampler
103
+ scales: [[128, 32]] # w, h
104
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
105
+ first_bs: &bs 128
106
+ fix_bs: false
107
+ divided_factor: [4, 16] # w, h
108
+ is_training: True
109
+ loader:
110
+ shuffle: True
111
+ batch_size_per_card: *bs
112
+ drop_last: True
113
+ max_ratio: &max_ratio 12
114
+ num_workers: 4
115
+
116
+ Eval:
117
+ dataset:
118
+ name: RatioDataSetTVResize
119
+ ds_width: True
120
+ padding: False
121
+ data_dir_list: [
122
+ '../evaluation/CUTE80',
123
+ '../evaluation/IC13_857',
124
+ '../evaluation/IC15_1811',
125
+ '../evaluation/IIIT5k',
126
+ '../evaluation/SVT',
127
+ '../evaluation/SVTP',
128
+ ]
129
+ transforms:
130
+ - DecodeImagePIL: # load image
131
+ img_mode: RGB
132
+ - CTCLabelEncode: # Class handling label
133
+ character_dict_path: *character_dict_path
134
+ use_space_char: *use_space_char
135
+ max_text_length: *max_text_length
136
+ - KeepKeys:
137
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
138
+ sampler:
139
+ name: RatioSampler
140
+ scales: [[128, 32]] # w, h
141
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
142
+ first_bs: *bs
143
+ fix_bs: false
144
+ divided_factor: [4, 16] # w, h
145
+ is_training: False
146
+ loader:
147
+ shuffle: False
148
+ drop_last: False
149
+ batch_size_per_card: *bs
150
+ max_ratio: *max_ratio
151
+ num_workers: 4
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_smtr_long.yml ADDED
@@ -0,0 +1,150 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_nodetach_smtr_long_infer
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 1000]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/predicts_smtr.txt
22
+ use_amp: True
23
+ distributed: true
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.000325
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
34
+ cycle_momentum: False
35
+ Architecture:
36
+ model_type: rec
37
+ algorithm: BGPD
38
+ in_channels: 3
39
+ Transform:
40
+ Encoder:
41
+ name: SVTRv2LNConvTwo33
42
+ use_pos_embed: False
43
+ out_channels: 256
44
+ dims: [128, 256, 384]
45
+ depths: [6, 6, 6]
46
+ num_heads: [4, 8, 12]
47
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
48
+ local_k: [[5, 5], [5, 5], [-1, -1]]
49
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
50
+ last_stage: false
51
+ feat2d: True
52
+ Decoder:
53
+ name: GTCDecoder
54
+ infer_gtc: True
55
+ detach: False
56
+ gtc_decoder:
57
+ name: SMTRDecoder
58
+ num_layer: 1
59
+ ds: True
60
+ max_len: *max_text_length
61
+ next_mode: &next True
62
+ sub_str_len: &subsl 5
63
+ infer_aug: True
64
+ ctc_decoder:
65
+ name: RCTCDecoder
66
+
67
+ Loss:
68
+ name: GTCLoss
69
+ ctc_weight: 0.1
70
+ gtc_loss:
71
+ name: SMTRLoss
72
+
73
+ PostProcess:
74
+ name: GTCLabelDecode
75
+ gtc_label_decode:
76
+ name: SMTRLabelDecode
77
+ next_mode: *next
78
+ character_dict_path: *character_dict_path
79
+ use_space_char: *use_space_char
80
+ only_gtc: True
81
+
82
+ Metric:
83
+ name: RecGTCMetric
84
+ main_indicator: acc
85
+ is_filter: True
86
+
87
+ Train:
88
+ dataset:
89
+ name: RatioDataSetTVResize
90
+ ds_width: True
91
+ padding: false
92
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
93
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
94
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
95
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
96
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
97
+ ]
98
+ transforms:
99
+ - DecodeImagePIL: # load image
100
+ img_mode: RGB
101
+ - PARSeqAugPIL:
102
+ - SMTRLabelEncode: # Class handling label
103
+ sub_str_len: *subsl
104
+ character_dict_path: *character_dict_path
105
+ use_space_char: *use_space_char
106
+ max_text_length: *max_text_length
107
+ - KeepKeys:
108
+ keep_keys: ['image', 'label', 'label_subs', 'label_next', 'length_subs',
109
+ 'label_subs_pre', 'label_next_pre', 'length_subs_pre', 'length'] # dataloader will return list in this order
110
+ sampler:
111
+ name: RatioSampler
112
+ scales: [[128, 32]] # w, h
113
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
114
+ first_bs: &bs 256
115
+ fix_bs: false
116
+ divided_factor: [4, 16] # w, h
117
+ is_training: True
118
+ loader:
119
+ shuffle: True
120
+ batch_size_per_card: *bs
121
+ drop_last: True
122
+ max_ratio: &max_ratio 12
123
+ num_workers: 4
124
+
125
+ Eval:
126
+ dataset:
127
+ name: SimpleDataSet
128
+ data_dir: ../ltb/
129
+ label_file_list: ['../ltb/ultra_long_70_list.txt']
130
+ transforms:
131
+ - DecodeImage: # load image
132
+ img_mode: BGR
133
+ channel_first: False
134
+ - GTCLabelEncode: # Class handling label
135
+ gtc_label_encode:
136
+ name: ARLabelEncode
137
+ character_dict_path: *character_dict_path
138
+ use_space_char: *use_space_char
139
+ max_text_length: 200
140
+ - SliceResize:
141
+ image_shape: [3, 32, 128]
142
+ padding: False
143
+ max_ratio: 12
144
+ - KeepKeys:
145
+ keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
146
+ loader:
147
+ shuffle: False
148
+ drop_last: False
149
+ batch_size_per_card: 1
150
+ num_workers: 2
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_stream.yml ADDED
@@ -0,0 +1,152 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 60
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_stream
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
18
+ # ./tools/utils/ppocr_keys_v1.txt # ch
19
+ max_text_length: &max_text_length 25
20
+ use_space_char: &use_space_char False
21
+ save_res_path: ./output/rec/predicts_smtr.txt
22
+ use_amp: True
23
+ distributed: true
24
+ grad_clip_val: 20
25
+
26
+ Optimizer:
27
+ name: AdamW
28
+ lr: 0.00065
29
+ weight_decay: 0.05
30
+ filter_bias_and_bn: True
31
+
32
+ LRScheduler:
33
+ name: OneCycleLR
34
+    warmup_epoch: 5 # pct_start 5/60 ≈ 0.083
35
+ cycle_momentum: False
36
+
37
+ Architecture:
38
+ model_type: rec
39
+ algorithm: BGPD
40
+ in_channels: 3
41
+ Transform:
42
+ Encoder:
43
+ name: SVTRv2LNConvTwo33
44
+ use_pos_embed: False
45
+ out_channels: 256
46
+ dims: [128, 256, 384]
47
+ depths: [6, 6, 6]
48
+ num_heads: [4, 8, 12]
49
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
50
+ local_k: [[5, 5], [5, 5], [-1, -1]]
51
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
52
+ last_stage: false
53
+ feat2d: True
54
+ Decoder:
55
+ name: GTCDecoder
56
+ infer_gtc: True
57
+ detach: False
58
+ gtc_decoder:
59
+ name: SMTRDecoder
60
+ num_layer: 1
61
+ ds: True
62
+ max_len: *max_text_length
63
+ next_mode: &next True
64
+ sub_str_len: &subsl 5
65
+ infer_aug: False
66
+ ctc_decoder:
67
+ name: RCTCDecoder
68
+
69
+ Loss:
70
+ name: GTCLoss
71
+ ctc_weight: 0.25
72
+ gtc_loss:
73
+ name: SMTRLoss
74
+
75
+ PostProcess:
76
+ name: GTCLabelDecode
77
+ gtc_label_decode:
78
+ name: SMTRLabelDecode
79
+ next_mode: *next
80
+ character_dict_path: *character_dict_path
81
+ use_space_char: *use_space_char
82
+ only_gtc: True
83
+
84
+ Metric:
85
+ name: RecMetric
86
+ main_indicator: acc
87
+ is_filter: True
88
+ stream: True
89
+
90
+ Train:
91
+ dataset:
92
+ name: RatioDataSetTVResize
93
+ ds_width: True
94
+ padding: false
95
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
96
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
97
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
98
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
99
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
100
+ ]
101
+ transforms:
102
+ - DecodeImagePIL: # load image
103
+ img_mode: RGB
104
+ - PARSeqAugPIL:
105
+ - SMTRLabelEncode: # Class handling label
106
+ sub_str_len: *subsl
107
+ character_dict_path: *character_dict_path
108
+ use_space_char: *use_space_char
109
+ max_text_length: *max_text_length
110
+ - KeepKeys:
111
+ keep_keys: ['image', 'label', 'label_subs', 'label_next', 'length_subs',
112
+ 'label_subs_pre', 'label_next_pre', 'length_subs_pre', 'length'] # dataloader will return list in this order
113
+ sampler:
114
+ name: RatioSampler
115
+ scales: [[128, 32]] # w, h
116
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
117
+ first_bs: &bs 256
118
+ fix_bs: false
119
+ divided_factor: [4, 16] # w, h
120
+ is_training: True
121
+ loader:
122
+ shuffle: True
123
+ batch_size_per_card: *bs
124
+ drop_last: True
125
+ max_ratio: &max_ratio 12
126
+ num_workers: 4
127
+
128
+ Eval:
129
+ dataset:
130
+ name: SimpleDataSet
131
+ data_dir: ../ltb/
132
+ label_file_list: ['../ltb/ultra_long_70_list.txt']
133
+ transforms:
134
+ - DecodeImagePIL: # load image
135
+ img_mode: RGB
136
+ - GTCLabelEncode: # Class handling label
137
+ gtc_label_encode:
138
+ name: ARLabelEncode
139
+ character_dict_path: *character_dict_path
140
+ use_space_char: *use_space_char
141
+ max_text_length: *max_text_length
142
+ - SliceTVResize:
143
+ image_shape: [32, 128]
144
+ padding: False
145
+ max_ratio: 4
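+          # Assumption: SliceTVResize cuts an ultra-long text image into slices of at most max_ratio * 32
+          # width at height 32; the slices are recognized one by one and merged by the metric
+          # (Metric.stream: True), which is why batch_size_per_card is 1 here.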
146
+ - KeepKeys:
147
+ keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
148
+ loader:
149
+ shuffle: False
150
+ drop_last: False
151
+ batch_size_per_card: 1
152
+ num_workers: 2
configs/rec/igtr/readme.md ADDED
@@ -0,0 +1,192 @@
1
+ # IGTR
2
+
3
+ - [IGTR](#igtr)
4
+ - [1. Introduction](#1-introduction)
5
+ - [2. Environment](#2-environment)
6
+ - [Dataset Preparation](#dataset-preparation)
7
+ - [3. Model Training / Evaluation](#3-model-training--evaluation)
8
+ - [Citation](#citation)
9
+
10
+ <a name="1"></a>
11
+
12
+ ## 1. Introduction
13
+
14
+ Paper:
15
+
16
+ > [Instruction-Guided Scene Text Recognition](https://arxiv.org/abs/2401.17851),
17
+ > Yongkun Du, Zhineng Chen, Yuchen Su, Caiyan Jia, Yu-Gang Jiang,
18
+ > TPAMI
19
+
20
+ <a name="model"></a>
21
+ Multi-modal models have shown appealing performance in visual recognition tasks, as free-form text-guided training evokes the ability to understand fine-grained visual content. However, current models cannot be trivially applied to scene text recognition (STR) due to the compositional difference between natural and text images. We propose a novel instruction-guided scene text recognition (IGTR) paradigm that formulates STR as an instruction learning problem and understands text images by predicting character attributes, e.g., character frequency, position, etc. IGTR first devises $\\left \\langle condition,question,answer\\right \\rangle$ instruction triplets, providing rich and diverse descriptions of character attributes. To effectively learn these attributes through question-answering, IGTR develops a lightweight instruction encoder, a cross-modal feature fusion module and a multi-task answer head, which guides nuanced text image understanding. Furthermore, IGTR realizes different recognition pipelines simply by using different instructions, enabling a character-understanding-based text reasoning paradigm that differs from current methods considerably. Experiments on English and Chinese benchmarks show that IGTR outperforms existing models by significant margins, while maintaining a small model size and fast inference speed. Moreover, by adjusting the sampling of instructions, IGTR offers an elegant way to tackle the recognition of rarely appearing and morphologically similar characters, which were previous challenges.
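+
+ To make the instruction-triplet idea concrete, here is a minimal, purely illustrative sketch of what a ⟨condition, question, answer⟩ instruction about character attributes might look like (the field names below are invented for illustration; the actual encoding is implemented by `IGTRLabelEncode` in the training configs):
+
+ ```python
+ # Hypothetical instruction triplets for a word image whose label is "hello".
+ # Each triplet pairs a condition and a question about character attributes with its answer.
+ instructions = [
+     {"condition": "the image contains the character 'l'",
+      "question": "how many times does 'l' occur?",
+      "answer": 2},                      # character-frequency attribute
+     {"condition": "the first character is 'h'",
+      "question": "which character is at position 3?",
+      "answer": "l"},                    # character-position attribute
+ ]
+ ```
+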
22
+
23
+ <a name="model"></a>
24
+ The accuracy (%) and model files of IGTR on public scene text recognition datasets are as follows:
25
+
26
+ - Trained on the synthetic datasets (MJ+ST) and tested on the common benchmarks; the training and test datasets are both from [PARSeq](https://github.com/baudm/parseq).
27
+
28
+ | Model | IC13<br/>857 | SVT | IIIT5k<br/>3000 | IC15<br/>1811 | SVTP | CUTE80 | Avg | Config&Model&Log |
29
+ | :-----: | :----------: | :--: | :-------------: | :-----------: | :--: | :----: | :---: | :---------------------------------------------------------------------------------------------: |
30
+ | IGTR-PD | 97.6 | 95.2 | 97.6 | 88.4 | 91.6 | 95.5 | 94.30 | [link](https://drive.google.com/drive/folders/1Pv0CW2hiWC_dIyaB74W1fsXqiX3z5yXA?usp=drive_link) |
31
+ | IGTR-AR | 98.6 | 95.7 | 98.2 | 88.4 | 92.4 | 95.5 | 94.78 | as above |
32
+
33
+ - Tested on the Union14M-L benchmark, from [Union14M](https://github.com/Mountchicken/Union14M/).
34
+
35
+ | Model | Curve | Multi-<br/>Oriented | Artistic | Contextless | Salient | Multi-<br/>word | General | Avg | Config&Model&Log |
36
+ | :-----: | :---: | :-----------------: | :------: | :---------: | :-----: | :-------------: | :-----: | :---: | :---------------------: |
37
+ | IGTR-PD | 76.9 | 30.6 | 59.1 | 63.3 | 77.8 | 62.5 | 66.7 | 62.40 | Same as the above table |
38
+ | IGTR-AR | 78.4 | 31.9 | 61.3 | 66.5 | 80.2 | 69.3 | 67.9 | 65.07 | as above |
39
+
40
+ - Trained on the Union14M-L training dataset.
41
+
42
+ | Model | IC13<br/>857 | SVT | IIIT5k<br/>3000 | IC15<br/>1811 | SVTP | CUTE80 | Avg | Config&Model&Log |
43
+ | :----------: | :----------: | :--: | :-------------: | :-----------: | :--: | :----: | :---: | :---------------------------------------------------------------------------------------------: |
44
+ | IGTR-PD | 97.7 | 97.7 | 98.3 | 89.8 | 93.7 | 97.9 | 95.86 | [link](https://drive.google.com/drive/folders/1ZGlzDqEzjrBg8qG2wBkbOm3bLRzFbTzo?usp=drive_link) |
45
+ | IGTR-AR | 98.1 | 98.4 | 98.7 | 90.5 | 94.9 | 98.3 | 96.48 | as above |
46
+ | IGTR-PD-60ep | 97.9 | 98.3 | 99.2 | 90.8 | 93.7 | 97.6 | 96.24 | [link](https://drive.google.com/drive/folders/1ik4hxZDRsjU1RbCA19nwE45Kg1bCnMoa?usp=drive_link) |
47
+ | IGTR-AR-60ep | 98.4 | 98.1 | 99.3 | 91.5 | 94.3 | 97.6 | 96.54 | as above |
48
+ | IGTR-PD-PT | 98.6 | 98.0 | 99.1 | 91.7 | 96.8 | 99.0 | 97.20 | [link](https://drive.google.com/drive/folders/1QM0EWV66IfYI1G0Xm066V2zJA62hH6-1?usp=drive_link) |
49
+ | IGTR-AR-PT | 98.8 | 98.3 | 99.2 | 92.0 | 96.8 | 99.0 | 97.34 | as above |
50
+
51
+ | Model | Curve | Multi-<br/>Oriented | Artistic | Contextless | Salient | Multi-<br/>word | General | Avg | Config&Model&Log |
52
+ | :----------: | :---: | :-----------------: | :------: | :---------: | :-----: | :-------------: | :-----: | :---: | :---------------------: |
53
+ | IGTR-PD | 88.1 | 89.9 | 74.2 | 80.3 | 82.8 | 79.2 | 83.0 | 82.51 | Same as the above table |
54
+ | IGTR-AR | 90.4 | 91.2 | 77.0 | 82.4 | 84.7 | 84.0 | 84.4 | 84.86 | as above |
55
+ | IGTR-PD-60ep | 90.0 | 92.1 | 77.5 | 82.8 | 86.0 | 83.0 | 84.8 | 85.18 | Same as the above table |
56
+ | IGTR-AR-60ep | 91.0 | 93.0 | 78.7 | 84.6 | 87.3 | 84.8 | 85.6 | 86.43 | as above |
57
+ | IGTR-PD-PT | 92.4 | 92.1 | 80.7 | 83.6 | 87.7 | 86.9 | 85.0 | 86.92 | Same as the above table |
58
+ | IGTR-AR-PT | 93.0 | 92.9 | 81.3 | 83.4 | 88.6 | 88.7 | 85.6 | 87.65 | as above |
59
+
60
+ - Trained and tested on the Chinese datasets, from the [Chinese Benchmark](https://github.com/FudanVI/benchmarking-chinese-text-recognition).
61
+
62
+ | Model | Scene | Web | Document | Handwriting | Avg | Config&Model&Log |
63
+ | :---------: | :---: | :--: | :------: | :---------: | :---: | :---------------------------------------------------------------------------------------------: |
64
+ | IGTR-PD | 73.1 | 74.8 | 98.6 | 52.5 | 74.75 | |
65
+ | IGTR-AR | 75.1 | 76.4 | 98.7 | 55.3 | 76.37 | |
66
+ | IGTR-PD-TS | 73.5 | 75.9 | 98.7 | 54.5 | 75.65 | [link](https://drive.google.com/drive/folders/1H3VRdGHjhawd6fkSC-qlBzVzvYYTpHRg?usp=drive_link) |
67
+ | IGTR-AR-TS | 75.6 | 77.0 | 98.8 | 57.3 | 77.17 | as above |
68
+ | IGTR-PD-Aug | 79.5 | 80.0 | 99.4 | 58.9 | 79.45 | [link](https://drive.google.com/drive/folders/1XFQkCILwcFwA7iYyQY9crnrouaI5sqcZ?usp=drive_link) |
69
+ | IGTR-AR-Aug | 82.0 | 81.7 | 99.5 | 63.8 | 81.74 | as above |
70
+
71
+ Download all Configs, Models, and Logs from [Google Drive](https://drive.google.com/drive/folders/1mSRDg9Mj5R6PspAdFGXZHDHTCQmjkd8d?usp=drive_link).
72
+
73
+ <a name="2"></a>
74
+
75
+ ## 2. Environment
76
+
77
+ - [PyTorch](http://pytorch.org/) version >= 1.13.0
78
+ - Python version >= 3.7
79
+
80
+ ```shell
81
+ git clone -b develop https://github.com/Topdu/OpenOCR.git
82
+ cd OpenOCR
83
+ # A100 Ubuntu 20.04 Cuda 11.8
84
+ conda create -n openocr python==3.8
85
+ conda activate openocr
86
+ conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=11.8 -c pytorch -c nvidia
87
+ pip install -r requirements.txt
88
+ ```
89
+
90
+ #### Dataset Preparation
91
+
92
+ - [English dataset download](https://github.com/baudm/parseq)
93
+
94
+ - [Union14M-L-LMDB-Filtered download](https://drive.google.com/drive/folders/1OlDWJZgvd6s4S09S3IGeAI90jI0i7AB_?usp=sharing)
95
+
96
+ - [Chinese dataset download](https://github.com/fudanvi/benchmarking-chinese-text-recognition#download)
97
+
98
+ The expected filesystem structure is as follows:
99
+
100
+ ```
101
+ benchmark_bctr
102
+ ├── benchmark_bctr_test
103
+ │ ├── document_test
104
+ │ ├── handwriting_test
105
+ │ ├── scene_test
106
+ │ └── web_test
107
+ └── benchmark_bctr_train
108
+ ├── document_train
109
+ ├── handwriting_train
110
+ ├── scene_train
111
+ └── web_train
112
+ evaluation
113
+ ├── CUTE80
114
+ ├── IC13_857
115
+ ├── IC15_1811
116
+ ├── IIIT5k
117
+ ├── SVT
118
+ └── SVTP
119
+ OpenOCR
120
+ synth
121
+ ├── MJ
122
+ │ ├── test
123
+ │ ├── train
124
+ │ └── val
125
+ └── ST
126
+ test # from PARSeq
127
+ ├── ArT
128
+ ├── COCOv1.4
129
+ ├── CUTE80
130
+ ├── IC13_1015
131
+ ├── IC13_1095
132
+ ├── IC13_857
133
+ ├── IC15_1811
134
+ ├── IC15_2077
135
+ ├── IIIT5k
136
+ ├── SVT
137
+ ├── SVTP
138
+ └── Uber
139
+ u14m # lmdb format
140
+ ├── artistic
141
+ ├── contextless
142
+ ├── curve
143
+ ├── general
144
+ ├── multi_oriented
145
+ ├── multi_words
146
+ └── salient
147
+ Union14M-L-LMDB-Filtered # lmdb format
148
+ ├── train_challenging
149
+ ├── train_easy
150
+ ├── train_hard
151
+ ├── train_medium
152
+ └── train_normal
153
+ ```
154
+
155
+ <a name="3"></a>
156
+
157
+ ## 3. Model Training / Evaluation
158
+
159
+ Training:
160
+
161
+ ```shell
162
+ # The configuration file is available from the link provided in the table above.
163
+ # Multi GPU training
164
+ CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 tools/train_rec.py --c PATH/svtr_base_igtr_XXX.yml
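+ # --nproc_per_node should match the number of GPUs listed in CUDA_VISIBLE_DEVICES (2 in this example).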
165
+ ```
166
+
167
+ Evaluation:
168
+
169
+ ```shell
170
+ # The configuration file is available from the link provided in the table above.
171
+ # en
172
+ python tools/eval_rec_all_en.py --c PATH/svtr_base_igtr_syn.yml
173
+ # ch
174
+ python tools/eval_rec_all_ch.py --c PATH/svtr_base_igtr_ch_aug.yml
175
+ ```
176
+
177
+ ## Citation
178
+
179
+ If you find our method useful for your research, please cite:
180
+
181
+ ```bibtex
182
+ @article{Du2024IGTR,
183
+ title = {Instruction-Guided Scene Text Recognition},
184
+ author = {Du, Yongkun and Chen, Zhineng and Su, Yuchen and Jia, Caiyan and Jiang, Yu-Gang},
185
+ journal = {CoRR},
186
+ eprinttype = {arXiv},
187
+ primaryClass={cs.CV},
188
+ volume = {abs/2401.17851},
189
+ year = {2024},
190
+ url = {https://arxiv.org/abs/2401.17851}
191
+ }
192
+ ```
configs/rec/igtr/svtr_base_ds_igtr.yml ADDED
@@ -0,0 +1,157 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtr_base_igtr
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ checkpoints:
14
+ use_tensorboard: false
15
+ infer_img:
16
+ # for data or label process
17
+ character_dict_path: &character_dict_path
18
+ # ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_igtr.txt
23
+ use_amp: True
24
+
25
+ Optimizer:
26
+ name: AdamW
27
+ lr: 0.0005 # 2gpus 384bs/gpu
28
+ weight_decay: 0.05
29
+ filter_bias_and_bn: True
30
+
31
+ LRScheduler:
32
+ name: OneCycleLR
33
+ warmup_epoch: 1.5
34
+ cycle_momentum: False
35
+
36
+ Architecture:
37
+ model_type: rec
38
+ algorithm: IGTR
39
+ in_channels: 3
40
+ Transform:
41
+ Encoder:
42
+ name: SVTRNet2DPos
43
+ img_size: [32, -1]
44
+ out_char_num: 25
45
+ out_channels: 256
46
+ patch_merging: 'Conv'
47
+ embed_dim: [128, 256, 384]
48
+ depth: [6, 6, 6]
49
+ num_heads: [4, 8, 12]
50
+ mixer: ['ConvB','ConvB','ConvB','ConvB','ConvB','ConvB', 'ConvB','ConvB', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
51
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
52
+ last_stage: False
53
+ prenorm: True
54
+ use_first_sub: False
55
+ Decoder:
56
+ name: IGTRDecoder
57
+ dim: 384
58
+ num_layer: 1
59
+ ar: False
60
+ refine_iter: 0
61
+ # next_pred: True
62
+ next_pred: False
63
+ pos2d: True
64
+ ds: True
65
+ # pos_len: False
66
+ # rec_layer: 1
67
+
68
+
69
+ Loss:
70
+ name: IGTRLoss
71
+
72
+ PostProcess:
73
+ name: IGTRLabelDecode
74
+ character_dict_path: *character_dict_path
75
+ use_space_char: *use_space_char
76
+
77
+ Metric:
78
+ name: RecMetric
79
+ main_indicator: acc
80
+
81
+ Train:
82
+ dataset:
83
+ name: RatioDataSet
84
+ ds_width: True
85
+ padding: &padding False
86
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
87
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
88
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
89
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
90
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
91
+ ]
92
+ transforms:
93
+ - DecodeImage: # load image
94
+ img_mode: BGR
95
+ channel_first: False
96
+ - PARSeqAug:
97
+ - IGTRLabelEncode: # Class handling label
98
+ k: 8
99
+ prompt_error: False
100
+ character_dict_path: *character_dict_path
101
+ use_space_char: *use_space_char
102
+ max_text_length: *max_text_length
103
+ - KeepKeys:
104
+ keep_keys: ['image', 'label', 'prompt_pos_idx_list',
105
+ 'prompt_char_idx_list', 'ques_pos_idx_list', 'ques1_answer_list',
106
+ 'ques2_char_idx_list', 'ques2_answer_list', 'ques3_answer', 'ques4_char_num_list',
107
+ 'ques_len_list', 'ques2_len_list', 'prompt_len_list', 'length'] # dataloader will return list in this order
108
+ sampler:
109
+ name: RatioSampler
110
+ scales: [[128, 32]] # w, h
111
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
112
+ first_bs: &bs 384
113
+ fix_bs: false
114
+ divided_factor: [4, 16] # w, h
115
+ is_training: True
116
+ loader:
117
+ shuffle: True
118
+ batch_size_per_card: *bs
119
+ drop_last: True
120
+ max_ratio: &max_ratio 4
121
+ num_workers: 4
122
+
123
+ Eval:
124
+ dataset:
125
+ name: RatioDataSet
126
+ ds_width: True
127
+ padding: *padding
128
+ data_dir_list: ['../evaluation/CUTE80',
129
+ '../evaluation/IC13_857',
130
+ '../evaluation/IC15_1811',
131
+ '../evaluation/IIIT5k',
132
+ '../evaluation/SVT',
133
+ '../evaluation/SVTP']
134
+ transforms:
135
+ - DecodeImage: # load image
136
+ img_mode: BGR
137
+ channel_first: False
138
+ - ARLabelEncode: # Class handling label
139
+ character_dict_path: *character_dict_path
140
+ use_space_char: *use_space_char
141
+ max_text_length: *max_text_length
142
+ - KeepKeys:
143
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
144
+ sampler:
145
+ name: RatioSampler
146
+ scales: [[128, 32]] # w, h
147
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
148
+ first_bs: 256
149
+ fix_bs: false
150
+ divided_factor: [4, 16] # w, h
151
+ is_training: False
152
+ loader:
153
+ shuffle: False
154
+ drop_last: False
155
+ batch_size_per_card: 256
156
+ max_ratio: *max_ratio
157
+ num_workers: 4
configs/rec/lister/focalsvtr_lister_wo_fem_maxratio12.yml ADDED
@@ -0,0 +1,133 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/focalsvtr_lister_wo_fem_maxratio12/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: &use_space_char False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_focalsvtr_lister_wo_fem_maxratio12.txt
19
+ use_amp: True
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.00065
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
31
+ cycle_momentum: False
32
+
33
+ Architecture:
34
+ model_type: rec
35
+ algorithm: LISTER
36
+ Transform:
37
+ Encoder:
38
+ name: FocalSVTR
39
+ img_size: [32, 128]
40
+ depths: [6, 6, 9]
41
+ embed_dim: 96
42
+ sub_k: [[1, 1], [2, 1], [1, 1]]
43
+ focal_levels: [3, 3, 3]
44
+ last_stage: False
45
+ feat2d: True
46
+ Decoder:
47
+ name: LISTERDecoder
48
+ detach_grad: False
49
+ attn_scaling: True
50
+ use_fem: False
51
+
52
+ Loss:
53
+ name: LISTERLoss
54
+
55
+ PostProcess:
56
+ name: LISTERLabelDecode
57
+
58
+ Metric:
59
+ name: RecMetric
60
+ main_indicator: acc
61
+ is_filter: True
62
+
63
+ Train:
64
+ dataset:
65
+ name: RatioDataSetTVResize
66
+ ds_width: True
67
+ padding: False
68
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_filter_train_challenging',
69
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_hard',
70
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_medium',
71
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_normal',
72
+ '../Union14M-L-LMDB-Filtered/filter_filter_train_easy',
73
+ ]
74
+ transforms:
75
+ - DecodeImagePIL: # load image
76
+ img_mode: RGB
77
+ - PARSeqAugPIL:
78
+ - EPLabelEncode: # Class handling label
79
+ character_dict_path: *character_dict_path
80
+ use_space_char: *use_space_char
81
+ max_text_length: *max_text_length
82
+ - KeepKeys:
83
+ keep_keys: ['image', 'label', 'length']
84
+ sampler:
85
+ name: RatioSampler
86
+ scales: [[128, 32]] # w, h
87
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
88
+ first_bs: &bs 256
89
+ fix_bs: false
90
+ divided_factor: [4, 16] # w, h
91
+ is_training: True
92
+ loader:
93
+ shuffle: True
94
+ batch_size_per_card: *bs
95
+ drop_last: True
96
+ max_ratio: 12
97
+ num_workers: 4
98
+
99
+ Eval:
100
+ dataset:
101
+ name: RatioDataSetTVResize
102
+ ds_width: True
103
+ padding: False
104
+ data_dir_list: ['../evaluation/CUTE80',
105
+ '../evaluation/IC13_857',
106
+ '../evaluation/IC15_1811',
107
+ '../evaluation/IIIT5k',
108
+ '../evaluation/SVT',
109
+ '../evaluation/SVTP',
110
+ ]
111
+ transforms:
112
+ - DecodeImagePIL: # load image
113
+ img_mode: RGB
114
+ - EPLabelEncode: # Class handling label
115
+ character_dict_path: *character_dict_path
116
+ use_space_char: *use_space_char
117
+ max_text_length: *max_text_length
118
+ - KeepKeys:
119
+ keep_keys: ['image', 'label', 'length']
120
+ sampler:
121
+ name: RatioSampler
122
+ scales: [[128, 32]] # w, h
123
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
124
+ first_bs: 256
125
+ fix_bs: false
126
+ divided_factor: [4, 16] # w, h
127
+ is_training: False
128
+ loader:
129
+ shuffle: False
130
+ drop_last: False
131
+ batch_size_per_card: *bs
132
+ max_ratio: 12
133
+ num_workers: 4
configs/rec/lister/svtrv2_lister_wo_fem_maxratio12.yml ADDED
@@ -0,0 +1,138 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtrv2_lister_wo_fem_maxratio12/
7
+ eval_epoch_step: [0, 1]
8
+ eval_batch_step: [0, 500]
9
+ cal_metric_during_train: True
10
+ pretrained_model:
11
+ checkpoints:
12
+ use_tensorboard: false
13
+ infer_img:
14
+ # for data or label process
15
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
16
+ max_text_length: &max_text_length 25
17
+ use_space_char: &use_space_char False
18
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_lister_wo_fem_maxratio12.txt
19
+ use_amp: True
20
+ grad_clip_val: 20
21
+
22
+ Optimizer:
23
+ name: AdamW
24
+ lr: 0.000325
25
+ weight_decay: 0.05
26
+ filter_bias_and_bn: True
27
+
28
+ LRScheduler:
29
+ name: OneCycleLR
30
+
31
+ warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
32
+ cycle_momentum: False
33
+
34
+ Architecture:
35
+ model_type: rec
36
+ algorithm: LISTER
37
+ Transform:
38
+ Encoder:
39
+ name: SVTRv2LNConvTwo33
40
+ use_pos_embed: False
41
+ out_channels: 256
42
+ dims: [128, 256, 384]
43
+ depths: [6, 6, 6]
44
+ num_heads: [4, 8, 12]
45
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
46
+ local_k: [[5, 5], [5, 5], [-1, -1]]
47
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
48
+ last_stage: false
49
+ feat2d: True
50
+ Decoder:
51
+ name: LISTERDecoder
52
+ detach_grad: False
53
+ attn_scaling: True
54
+ use_fem: False
55
+
56
+ Loss:
57
+ name: LISTERLoss
58
+
59
+ PostProcess:
60
+ name: LISTERLabelDecode
61
+
62
+ Metric:
63
+ name: RecMetric
64
+ main_indicator: acc
65
+ is_filter: True
66
+
67
+ Train:
68
+ dataset:
69
+ name: RatioDataSetTVResize
70
+ ds_width: True
71
+ padding: False
72
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
73
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
74
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
75
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
76
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
77
+ ]
78
+ transforms:
79
+ - DecodeImagePIL: # load image
80
+ img_mode: RGB
81
+ - PARSeqAugPIL:
82
+ - EPLabelEncode: # Class handling label
83
+ character_dict_path: *character_dict_path
84
+ use_space_char: *use_space_char
85
+ max_text_length: *max_text_length
86
+ - KeepKeys:
87
+ keep_keys: ['image', 'label', 'length']
88
+ sampler:
89
+ name: RatioSampler
90
+ scales: [[128, 32]] # w, h
91
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
92
+ first_bs: &bs 128
93
+ fix_bs: false
94
+ divided_factor: [4, 16] # w, h
95
+ is_training: True
96
+ loader:
97
+ shuffle: True
98
+ batch_size_per_card: *bs
99
+ drop_last: True
100
+ max_ratio: 12
101
+ num_workers: 4
102
+
103
+ Eval:
104
+ dataset:
105
+ name: RatioDataSetTVResize
106
+ ds_width: True
107
+ padding: False
108
+ data_dir_list: ['../evaluation/CUTE80',
109
+ '../evaluation/IC13_857',
110
+ '../evaluation/IC15_1811',
111
+ '../evaluation/IIIT5k',
112
+ '../evaluation/SVT',
113
+ '../evaluation/SVTP',
114
+ ]
115
+ transforms:
116
+ - DecodeImagePIL: # load image
117
+ img_mode: RGB
118
+ - EPLabelEncode: # Class handling label
119
+ character_dict_path: *character_dict_path
120
+ use_space_char: *use_space_char
121
+ max_text_length: *max_text_length
122
+
123
+ - KeepKeys:
124
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
125
+ sampler:
126
+ name: RatioSampler
127
+ scales: [[128, 32]] # w, h
128
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
129
+ first_bs: 256
130
+ fix_bs: false
131
+ divided_factor: [4, 16] # w, h
132
+ is_training: False
133
+ loader:
134
+ shuffle: False
135
+ drop_last: False
136
+ batch_size_per_card: *bs
137
+ max_ratio: 12
138
+ num_workers: 4
configs/rec/lpv/svtr_base_lpv.yml ADDED
@@ -0,0 +1,124 @@
1
+ Global:
2
+ device: gpu
3
+ epoch_num: 20
4
+ log_smooth_window: 20
5
+ print_batch_step: 10
6
+ output_dir: ./output/rec/u14m_filter/svtr_base_lpv/
7
+ save_epoch_step: [15, 1]
8
+ # evaluation is run every 2000 iterations
9
+ eval_batch_step: [0, 500]
10
+ eval_epoch_step: [0, 1]
11
+ cal_metric_during_train: True
12
+ pretrained_model:
13
+ # ./output/rec/u14m_filter/svtr_base_lpv_wo_glrm/best.pth
14
+ checkpoints:
15
+ use_tensorboard: false
16
+ infer_img:
17
+ # for data or label process
18
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
19
+ # ./tools/utils/ppocr_keys_v1.txt # ch
20
+ max_text_length: &max_text_length 25
21
+ use_space_char: &use_space_char False
22
+ save_res_path: ./output/rec/u14m_filter/predicts_svtr_lpv.txt
23
+ use_amp: True
24
+ grad_clip_val: 20
25
+
26
+ Optimizer:
27
+ name: Adam
28
+ lr: 0.0001 # for 4gpus bs128/gpu
29
+ weight_decay: 0.0
30
+ filter_bias_and_bn: False
31
+ betas: [0.9, 0.99]
32
+
33
+ LRScheduler:
34
+ name: MultiStepLR
35
+ milestones: [12]
36
+ gamma: 0.1
37
+
38
+ Architecture:
39
+ model_type: rec
40
+ algorithm: LPV
41
+ in_channels: 3
42
+ Transform:
43
+ Encoder:
44
+ name: SVTRNet
45
+ img_size: [32, 128]
46
+ out_char_num: 25
47
+ out_channels: 256
48
+ patch_merging: 'Conv'
49
+ embed_dim: [128, 256, 384]
50
+ depth: [6, 6, 6]
51
+ num_heads: [4, 8, 12]
52
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
53
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
54
+ sub_k: [[1, 1], [1, 1]]
55
+ feature2d: True
56
+ last_stage: False
57
+ prenorm: True
58
+ Decoder:
59
+ name: LPVDecoder
60
+ num_layer: 3
61
+ max_len: *max_text_length
62
+ use_mask: True
63
+ dim_feedforward: 1536
64
+ nhead: 12
65
+ dropout: 0.1
66
+ trans_layer: 3
67
+
68
+ Loss:
69
+ name: LPVLoss
70
+
71
+ PostProcess:
72
+ name: ARLabelDecode
73
+ character_dict_path: *character_dict_path
74
+ use_space_char: *use_space_char
75
+
76
+ Metric:
77
+ name: RecMetric
78
+ main_indicator: acc
79
+ is_filter: True
80
+
81
+ Train:
82
+ dataset:
83
+ name: LMDBDataSet
84
+ data_dir: ../Union14M-L-LMDB-Filtered
85
+ transforms:
86
+ - DecodeImagePIL: # load image
87
+ img_mode: RGB
88
+ - PARSeqAugPIL:
89
+ - ARLabelEncode: # Class handling label
90
+ character_dict_path: *character_dict_path
91
+ use_space_char: *use_space_char
92
+ max_text_length: *max_text_length
93
+ - RecTVResize:
94
+ image_shape: [32, 128]
95
+ padding: False
96
+ - KeepKeys:
97
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
98
+ loader:
99
+ shuffle: True
100
+ batch_size_per_card: 128
101
+ drop_last: True
102
+ num_workers: 4
103
+
104
+ Eval:
105
+ dataset:
106
+ name: LMDBDataSet
107
+ data_dir: ../evaluation/
108
+ transforms:
109
+ - DecodeImagePIL: # load image
110
+ img_mode: RGB
111
+ - ARLabelEncode: # Class handling label
112
+ character_dict_path: *character_dict_path
113
+ use_space_char: *use_space_char
114
+ max_text_length: *max_text_length
115
+ - RecTVResize:
116
+ image_shape: [32, 128]
117
+ padding: False
118
+ - KeepKeys:
119
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
120
+ loader:
121
+ shuffle: False
122
+ drop_last: False
123
+ batch_size_per_card: 128
124
+ num_workers: 4
configs/rec/lpv/svtr_base_lpv_wo_glrm.yml ADDED
@@ -0,0 +1,123 @@
+ Global:
+ device: gpu
+ epoch_num: 20
+ log_smooth_window: 20
+ print_batch_step: 10
+ output_dir: ./output/rec/u14m_filter/svtr_base_lpv_wo_glrm/
+ save_epoch_step: [15, 1]
+ # evaluation is run every 500 iterations
+ eval_batch_step: [0, 500]
+ eval_epoch_step: [0, 1]
+ cal_metric_during_train: True
+ pretrained_model:
+ checkpoints:
+ use_tensorboard: false
+ infer_img:
+ # for data or label process
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
+ # ./tools/utils/ppocr_keys_v1.txt # ch
+ max_text_length: &max_text_length 25
+ use_space_char: &use_space_char False
+ save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_lpv_wo_glrm.txt
+ use_amp: True
+ grad_clip_val: 20
+
+ Optimizer:
+ name: Adam
+ lr: 0.0001 # for 4gpus bs128/gpu
+ weight_decay: 0.0
+ filter_bias_and_bn: False
+ betas: [0.9, 0.99]
+
+ LRScheduler:
+ name: MultiStepLR
+ milestones: [12]
+ gamma: 0.1
+
+ Architecture:
+ model_type: rec
+ algorithm: LPV
+ in_channels: 3
+ Transform:
+ Encoder:
+ name: SVTRNet
+ img_size: [32, 128]
+ out_char_num: 25
+ out_channels: 256
+ patch_merging: 'Conv'
+ embed_dim: [128, 256, 384]
+ depth: [6, 6, 6]
+ num_heads: [4, 8, 12]
+ mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
+ local_mixer: [[5, 5], [5, 5], [5, 5]]
+ sub_k: [[1, 1], [1, 1]]
+ feature2d: True
+ last_stage: False
+ prenorm: True
+ Decoder:
+ name: LPVDecoder
+ num_layer: 3
+ max_len: *max_text_length
+ use_mask: False
+ dim_feedforward: 1536
+ nhead: 12
+ dropout: 0.1
+ trans_layer: 3
+
+ Loss:
+ name: LPVLoss
+
+ PostProcess:
+ name: ARLabelDecode
+ character_dict_path: *character_dict_path
+ use_space_char: *use_space_char
+
+ Metric:
+ name: RecMetric
+ main_indicator: acc
+ is_filter: True
+
+ Train:
+ dataset:
+ name: LMDBDataSet
+ data_dir: ../Union14M-L-LMDB-Filtered
+ transforms:
+ - DecodeImagePIL: # load image
+ img_mode: RGB
+ - PARSeqAugPIL:
+ - ARLabelEncode: # Class handling label
+ character_dict_path: *character_dict_path
+ use_space_char: *use_space_char
+ max_text_length: *max_text_length
+ - RecTVResize:
+ image_shape: [32, 128]
+ padding: False
+ - KeepKeys:
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
+ loader:
+ shuffle: True
+ batch_size_per_card: 128
+ drop_last: True
+ num_workers: 4
+
+ Eval:
+ dataset:
+ name: LMDBDataSet
+ data_dir: ../evaluation/
+ transforms:
+ - DecodeImagePIL: # load image
+ img_mode: RGB
+ - ARLabelEncode: # Class handling label
+ character_dict_path: *character_dict_path
+ use_space_char: *use_space_char
+ max_text_length: *max_text_length
+ - RecTVResize:
+ image_shape: [32, 128]
+ padding: False
+ - KeepKeys:
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
+ loader:
+ shuffle: False
+ drop_last: False
+ batch_size_per_card: 128
+ num_workers: 4
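The decoder in this _wo_glrm variant matches svtr_base_lpv.yml except that use_mask is set to False, and the optimizer block pairs Adam (lr 1e-4 for 4 GPUs at batch 128 each) with a MultiStepLR schedule that decays once at epoch 12 of 20. A plain-Python sketch of that schedule, with the values copied from the Optimizer/LRScheduler blocks above:

# Sketch of the MultiStepLR schedule defined above (values from the config).
base_lr, gamma, milestones, total_epochs = 1e-4, 0.1, [12], 20

def lr_at(epoch):
    # MultiStepLR multiplies the base LR by gamma once per milestone passed.
    passed = sum(1 for m in milestones if epoch >= m)
    return base_lr * gamma ** passed

for e in (0, 11, 12, 19):
    print(e, lr_at(e))  # 1e-4 up to epoch 11, then 1e-5 from epoch 12 onward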
configs/rec/lpv/svtrv2_lpv.yml ADDED
@@ -0,0 +1,147 @@
+ Global:
+ device: gpu
+ epoch_num: 20
+ log_smooth_window: 20
+ print_batch_step: 10
+ output_dir: ./output/rec/u14m_filter/svtrv2_lpv/
+ save_epoch_step: [15, 1]
+ # evaluation is run every 500 iterations
+ eval_batch_step: [0, 500]
+ eval_epoch_step: [0, 1]
+ cal_metric_during_train: True
+ pretrained_model:
+ # ./output/rec/u14m_filter/svtrv2_lpv_wo_glrm/best.pth
+ checkpoints:
+ use_tensorboard: false
+ infer_img:
+ # for data or label process
+ character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
+ # ./tools/utils/ppocr_keys_v1.txt # ch
+ max_text_length: &max_text_length 25
+ use_space_char: &use_space_char False
+ save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_lpv.txt
+ use_amp: True
+ grad_clip_val: 20
+
+ Optimizer:
+ name: AdamW
+ lr: 0.000325 # for 4gpus bs128/gpu
+ weight_decay: 0.05
+ filter_bias_and_bn: True
+
+ LRScheduler:
+ name: OneCycleLR
+ warmup_epoch: 1 # pct_start 0.075*20 = 1.5ep
+ cycle_momentum: False
+
+ Architecture:
+ model_type: rec
+ algorithm: LPV
+ in_channels: 3
+ Transform:
+ Encoder:
+ name: SVTRv2LNConvTwo33
+ use_pos_embed: False
+ dims: [128, 256, 384]
+ depths: [6, 6, 6]
+ num_heads: [4, 8, 12]
+ mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
+ local_k: [[5, 5], [5, 5], [-1, -1]]
+ sub_k: [[1, 1], [2, 1], [-1, -1]]
+ last_stage: false
+ feat2d: True
+ Decoder:
+ name: LPVDecoder
+ num_layer: 3
+ max_len: *max_text_length
+ use_mask: True
+ dim_feedforward: 1536
+ nhead: 12
+ dropout: 0.1
+ trans_layer: 3
+
+ Loss:
+ name: LPVLoss
+
+ PostProcess:
+ name: ARLabelDecode
+ character_dict_path: *character_dict_path
+ use_space_char: *use_space_char
+
+ Metric:
+ name: RecMetric
+ main_indicator: acc
+ is_filter: True
+
+ Train:
+ dataset:
+ name: RatioDataSetTVResize
+ ds_width: True
+ padding: false
+ data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
+ '../Union14M-L-LMDB-Filtered/filter_train_hard',
+ '../Union14M-L-LMDB-Filtered/filter_train_medium',
+ '../Union14M-L-LMDB-Filtered/filter_train_normal',
+ '../Union14M-L-LMDB-Filtered/filter_train_easy',
+ ]
+ transforms:
+ - DecodeImagePIL: # load image
+ img_mode: RGB
+ - PARSeqAugPIL:
+ - ARLabelEncode: # Class handling label
+ character_dict_path: *character_dict_path
+ use_space_char: *use_space_char
+ max_text_length: *max_text_length
+ - KeepKeys:
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
+ sampler:
+ name: RatioSampler
+ scales: [[128, 32]] # w, h
+ # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
+ first_bs: &bs 128
+ fix_bs: false
+ divided_factor: [4, 16] # w, h
+ is_training: True
+ loader:
+ shuffle: True
+ batch_size_per_card: *bs
+ drop_last: True
+ max_ratio: &max_ratio 4
+ num_workers: 4
+
+ Eval:
+ dataset:
+ name: RatioDataSetTVResize
+ ds_width: True
+ padding: False
+ data_dir_list: [
+ '../evaluation/CUTE80',
+ '../evaluation/IC13_857',
+ '../evaluation/IC15_1811',
+ '../evaluation/IIIT5k',
+ '../evaluation/SVT',
+ '../evaluation/SVTP',
+ ]
+ transforms:
+ - DecodeImagePIL: # load image
+ img_mode: RGB
+ - ARLabelEncode: # Class handling label
+ character_dict_path: *character_dict_path
+ use_space_char: *use_space_char
+ max_text_length: *max_text_length
+ - KeepKeys:
+ keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
+ sampler:
+ name: RatioSampler
+ scales: [[128, 32]] # w, h
+ # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
+ first_bs: *bs
+ fix_bs: false
+ divided_factor: [4, 16] # w, h
+ is_training: False
+ loader:
+ shuffle: False
+ drop_last: False
+ batch_size_per_card: *bs
+ max_ratio: *max_ratio
+ num_workers: 4
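svtrv2_lpv.yml replaces the fixed-size LMDB pipeline with RatioDataSetTVResize plus a RatioSampler: samples are grouped by aspect ratio (capped at max_ratio 4) so each batch shares one target width at height 32, and widths stay compatible with the divided_factor so the encoder's downsampling works out. With first_bs 128 per card, the lr comment (0.000325 for 4 GPUs at bs128/gpu) corresponds to a global batch of 512. The helper below only illustrates the bucketing idea; the function name and grouping logic are hypothetical, not the actual RatioSampler implementation.

# Hypothetical illustration of aspect-ratio bucketing (not the real RatioSampler):
# group image indices by rounded w/h ratio, clamp to max_ratio, and derive a
# shared target width per bucket that is divisible by the downsampling factor.
from collections import defaultdict

def bucket_by_ratio(sizes, max_ratio=4, height=32, divided_factor=4):
    buckets = defaultdict(list)
    for idx, (w, h) in enumerate(sizes):
        ratio = min(max(round(w / h), 1), max_ratio)
        target_w = ratio * height
        target_w -= target_w % divided_factor  # keep the width divisible
        buckets[(target_w, height)].append(idx)
    return dict(buckets)

# Images with similar aspect ratios land in the same bucket and are batched together.
print(bucket_by_ratio([(100, 32), (250, 60), (480, 40), (64, 32)]))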