|
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
from pdf2image import convert_from_path |
|
|
from paddleocr import PaddleOCR |
|
|
import gradio as gr |
|
|
import numpy as np |
|
|
|
|
|
# Hugging Face model id of the causal LLM used to turn OCR text into JSON.
model_name = "fluently/FluentlyQwen3-1.7B"

# Load tokenizer and model once at import time so every request reuses them.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",   # let transformers pick the checkpoint's dtype
    device_map="auto"     # place layers on available devices automatically
)

# PaddleOCR engine configured for Russian bank documents.
ocr = PaddleOCR(
    lang="ru",                       # Russian recognition model
    use_textline_orientation=True,   # correct rotated text lines
    ocr_version="PP-OCRv5",
)
|
|
|
|
|
def process(pdf):
    """Full pipeline for one uploaded document: OCR the PDF, then ask the
    LLM to extract the structured fields.

    Args:
        pdf: Filesystem path of the uploaded PDF, or None when the user
            clicked the button without uploading a file (Gradio passes
            None through).

    Returns:
        The model's JSON answer as a string, or a short user-facing
        message when no text could be extracted.
    """
    ocr_text = scan(pdf)
    print(f"ocr_text: {ocr_text}")
    # BUG FIX: scan() returns None when no file was supplied; feeding
    # None (or an empty string) into llm() would crash inside the
    # tokenizer instead of telling the user what went wrong.
    if not ocr_text:
        return "Не удалось извлечь текст из документа."
    output = llm(ocr_text)
    return output
|
|
|
|
|
def scan(pdf):
    """OCR every page of *pdf* and return the recognized text.

    Args:
        pdf: Filesystem path to a PDF file, or None.

    Returns:
        All recognized text lines joined with newlines, or None when
        *pdf* is None.
    """
    if pdf is None:
        return None
    # Rasterize the PDF; 150 dpi is a speed/accuracy compromise for OCR.
    pil_images = convert_from_path(pdf, dpi=150)
    pages = []
    for page_no, image in enumerate(pil_images):
        print(page_no, len(pil_images))  # crude progress output to stdout
        results = ocr.predict(np.array(image))
        # BUG FIX: the original joined recognized lines with '' which
        # glued adjacent words together; newlines keep line boundaries
        # intact for the downstream LLM extraction.
        pages.append('\n'.join(results[0]["rec_texts"]))
    return '\n'.join(pages)
|
|
|
|
|
def llm(scanned):
    """Ask the chat model to extract bank-document fields as JSON.

    Args:
        scanned: OCR text of the document, sent as the user message.

    Returns:
        The model's final answer (expected to be a JSON string) with any
        thinking segment stripped.
    """
    # NOTE(review): "name_reciever" is misspelled but kept as-is — it
    # defines the output JSON schema that consumers may already rely on.
    messages = [
        {"role": "system", "content": """
Ты - система извлечения информации.
Задача: прочитай текст банковского документа ниже и выведи корректный JSON с извлечёнными данными.

Правила:

JSON должен содержать поля: bank_name, document_type, document_number, date, name_payer, name_reciever, payment_reason, payment_amount.
Не выводи пояснения или комментарии JSON.
Если в документе отсутствует поле, установи для него значение null.
Никогда не придумывай значения. Используй текст в точности так, как он написан.
Текст документа:
"""},
        {"role": "user", "content": scanned}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=32768
    )
    # Keep only the newly generated tokens (drop the echoed prompt).
    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

    # 151668 is the id of the </think> token in the Qwen3 vocabulary (per
    # the Qwen3 usage example) — everything before its LAST occurrence is
    # the thinking segment.
    try:
        index = len(output_ids) - output_ids[::-1].index(151668)
    except ValueError:
        # No </think> token found: treat the whole output as the answer.
        index = 0

    # Cleanup: the original also decoded the thinking segment into an
    # unused local (`thinking_content`); that dead work is dropped.
    content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
    return content
|
|
|
|
|
# Gradio UI: file upload on the left, extracted JSON on the right.
with gr.Blocks(title="Inquira") as demo:
    gr.Markdown("# Inquira — извлечение данных из банковских документов")
    with gr.Row():
        with gr.Column(scale=1):
            # PDF upload; type="filepath" hands process() a path string.
            inp = gr.File(
                label="Загрузите PDF-файл",
                file_types=[".pdf"],
                type="filepath"
            )
            btn = gr.Button("Отправить")

        with gr.Column(scale=2):
            # Model output; copy button eases pasting the JSON elsewhere.
            out = gr.Textbox(
                label="Вывод",
                type="text",
                lines=10,
                max_lines=30,
                show_copy_button=True
            )

    # Wire the button to the OCR + LLM pipeline.
    btn.click(process, inputs=inp, outputs=out)

demo.launch()
|
|
|
|
|
|