Spaces:

gold-experience-decentrathon
/

bank-doc-extractor

Sleeping

App Files Files Community

bank-doc-extractor / app.py

kitrofimov

Fix OCR scanning

28add3f 3 months ago

raw

history blame contribute delete

3.48 kB

	from transformers import AutoModelForCausalLM, AutoTokenizer
	from pdf2image import convert_from_path
	from paddleocr import PaddleOCR
	import gradio as gr
	import numpy as np

	# Hugging Face checkpoint identifier used for the extraction step.
	model_name = "fluently/FluentlyQwen3-1.7B"

	# Tokenizer and causal LM are loaded once, when the module is executed.
	# Both dtype and device placement are delegated to the library ("auto").
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

	# Shared OCR engine: lang="ru", text-line orientation handling switched on,
	# pinned to the PP-OCRv5 model version.
	ocr = PaddleOCR(lang="ru", use_textline_orientation=True, ocr_version="PP-OCRv5")

	def process(pdf):
	    """Run the whole pipeline for one uploaded PDF: OCR it, then query the LLM.

	    Args:
	        pdf: Filesystem path of the uploaded PDF, or None when the button is
	            pressed without a file.

	    Returns:
	        The text produced by ``llm`` for the scanned document, or an empty
	        string when no file was provided.
	    """
	    # Without this guard, scan() returns None for a missing upload and that
	    # None is handed to llm() as the user message content.
	    if pdf is None:
	        return ""

	    ocr_text = scan(pdf)
	    print(f"ocr_text: {ocr_text}")
	    return llm(ocr_text)

	def scan(pdf):
	    """OCR every page of a PDF and return the recognized text.

	    Each page is rasterized at 150 dpi and passed to the module-level ``ocr``
	    engine as a NumPy array.

	    Args:
	        pdf: Filesystem path of the PDF to read, or None.

	    Returns:
	        The recognized text, one recognized entry per line with pages in
	        document order, or None when ``pdf`` is None.
	    """
	    if pdf is None:
	        return None

	    pil_images = convert_from_path(pdf, dpi=150)
	    page_count = len(pil_images)
	    pages = []
	    for i, image in enumerate(pil_images):
	        print(i, page_count)  # progress: current page index, total pages
	        results = ocr.predict(np.array(image))
	        # Keep a separator between recognized entries: concatenating them
	        # with an empty string fuses neighbouring words and fields, which
	        # makes the text much harder to parse downstream.
	        pages.append("\n".join(results[0]["rec_texts"]))
	    return "\n".join(pages)

	def llm(scanned):
	    """Ask the language model to turn OCR text into a JSON description.

	    Builds a two-message chat (fixed system instructions plus the scanned
	    text as the user message), generates a completion with thinking mode
	    enabled, and returns only the part of the output that follows the
	    model's reasoning section.

	    Args:
	        scanned: Text of the bank document, as produced by the OCR step.

	    Returns:
	        The decoded model answer with special tokens removed and leading and
	        trailing newlines stripped. The answer is returned as-is; it is not
	        parsed or validated as JSON here.
	    """
	    # prepare the model input
	    messages = [
	        {"role": "system", "content": """
	Ты - система извлечения информации.
	Задача: прочитай текст банковского документа ниже и выведи корректный JSON с извлечёнными данными.

	Правила:

	JSON должен содержать поля: bank_name, document_type, document_number, date, name_payer, name_reciever, payment_reason, payment_amount.
	Не выводи пояснения или комментарии JSON.
	Если в документе отсутствует поле, установи для него значение null.
	Никогда не придумывай значения. Используй текст в точности так, как он написан.
	Текст документа:
	"""},
	        {"role": "user", "content": scanned},
	    ]
	    text = tokenizer.apply_chat_template(
	        messages,
	        tokenize=False,
	        add_generation_prompt=True,
	        enable_thinking=True,
	    )
	    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

	    # conduct text completion
	    generated_ids = model.generate(**model_inputs, max_new_tokens=32768)
	    # generate() returns prompt + completion; keep only the newly generated ids.
	    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

	    # Locate the position just past the last 151668 (</think>) token. When the
	    # token is absent the whole output is treated as the answer.
	    try:
	        index = len(output_ids) - output_ids[::-1].index(151668)
	    except ValueError:
	        index = 0

	    # Only the answer is needed by callers, so the reasoning tokens before
	    # `index` are not decoded at all.
	    return tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

	# Gradio UI: a file upload and submit button on the left, a text box with
	# the model output on the right.
	with gr.Blocks(title="Inquira") as demo:
	    gr.Markdown("# Inquira — извлечение данных из банковских документов")
	    with gr.Row():
	        # Narrow column: PDF upload (delivered to the handler as a file path)
	        # and the button that starts processing.
	        with gr.Column(scale=1):
	            inp = gr.File(label="Загрузите PDF-файл", file_types=[".pdf"], type="filepath")
	            btn = gr.Button("Отправить")

	        # Wide column: result text box.
	        with gr.Column(scale=2):
	            out = gr.Textbox(label="Вывод", type="text", lines=10, max_lines=30, show_copy_button=True)

	    # Clicking the button runs the OCR + LLM pipeline on the uploaded file.
	    btn.click(fn=process, inputs=inp, outputs=out)

	# Start the web server for the app.
	demo.launch()