| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166 |
- from typing import Dict, Any, List, Optional
- from core.ocr_service import PaddleOCRService
- from .llm_parser import LLMParser
- from .templates.template_registry import TemplateRegistry
- import os
- from utils.logger import chat_logger
class DocumentProcessingService:
    """Document processing service (simplified).

    Ties together an OCR service, keyword-based document type detection and
    an LLM parser so that a scanned image or a free-text message can be
    turned into structured bill data.
    """

    # Keyword groups used by _detect_document_type. Groups are checked in
    # order and the first group with a matching keyword wins.
    _TYPE_KEYWORDS = (
        ("invoice", ("发票", "增值税", "专用发票")),
        ("receipt", ("收据", "收款", "收条")),
        ("order", ("订单", "订货单", "采购单")),
    )

    def __init__(self, ocr_service: Optional["PaddleOCRService"] = None):
        """Create the service.

        Args:
            ocr_service: OCR service to use (dependency injection). When
                omitted, a default one is built from environment variables.
        """
        self.ocr_service = ocr_service or self._create_default_ocr_service()
        self.llm_parser = LLMParser()

    def _create_default_ocr_service(self) -> "PaddleOCRService":
        """Build the default OCR service.

        The API URL and token are read from the ``PADDLE_OCR_API_URL`` and
        ``PADDLE_OCR_TOKEN`` environment variables (empty string if unset).
        """
        return PaddleOCRService(
            api_url=os.getenv("PADDLE_OCR_API_URL", ""),
            token=os.getenv("PADDLE_OCR_TOKEN", ""),
        )

    def _extract_ocr_text(self, ocr_result: Any) -> str:
        """Return the text extracted from *ocr_result*, or ``""`` if blank.

        A warning is logged when the OCR service yields no text or only
        whitespace. Non-blank text is returned unmodified (not stripped).
        """
        text = self.ocr_service.extract_text_from_result(ocr_result)
        if not text or not text.strip():
            chat_logger.warning("OCR未识别到有效文本")
            return ""
        return text

    async def _resolve_and_parse(
        self, text: str, document_type: Optional[str]
    ) -> tuple:
        """Resolve the document type and parse *text* with the LLM.

        Shared by the OCR and message entry points so both apply the same
        rules: blank text is never sent to the LLM, and a missing type is
        auto-detected from the text before a template is looked up.

        Returns:
            A ``(document_type, parsed_data)`` pair. ``parsed_data`` is
            ``None`` when parsing was skipped (blank text or no type).
        """
        if not text or not text.strip():
            return document_type, None

        if not document_type:
            document_type = await self._detect_document_type(text)
        if not document_type:
            chat_logger.warning(
                "Document type not given and not detected; skipping LLM parsing"
            )
            return None, None

        template = TemplateRegistry.get_template(document_type)
        parsed_data = await self.llm_parser.parse_to_json(text, template)
        return document_type, parsed_data

    @staticmethod
    def _parsed_fields(
        document_type: Optional[str], parsed_data: Any
    ) -> Dict[str, Any]:
        """Return the result entries describing parsed data.

        Empty when *parsed_data* is falsy, so callers can ``update`` their
        result dict unconditionally.
        """
        if not parsed_data:
            return {}
        return {
            "document_type": document_type,
            "parsed_data": parsed_data,
            "has_parsed_data": True,
        }

    async def ocr_create_bill(
        self,
        image_bytes: bytes,
        document_type: Optional[str] = None,
        ocr_options: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Help create a bill from a scanned image.

        Args:
            image_bytes: Raw image bytes.
            document_type: Document type, e.g. ``'invoice'``. When omitted,
                it is auto-detected from the recognised text.
            ocr_options: Extra keyword arguments forwarded to the OCR call.

        Returns:
            A dict with ``success``, ``ocr_raw`` (raw OCR response) and
            ``ocr_text``; when parsing produced data it also contains
            ``document_type``, ``parsed_data`` and ``has_parsed_data``.
        """
        # 1. Run OCR on the image.
        ocr_result = await self.ocr_service.recognize_image_async(
            image_bytes, **(ocr_options or {})
        )
        # 2. Extract the recognised text (normalised to "" when blank).
        ocr_text = self._extract_ocr_text(ocr_result)
        # 3. Detect the type if needed and parse the text with the LLM.
        document_type, parsed_data = await self._resolve_and_parse(
            ocr_text, document_type
        )
        # 4. Assemble the response.
        result = {
            "success": parsed_data is not None,
            "ocr_raw": ocr_result,
            "ocr_text": ocr_text,
        }
        result.update(self._parsed_fields(document_type, parsed_data))
        return result

    async def pure_ocr(
        self,
        image_bytes: bytes,
        file_type: int = 1,
        ocr_options: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Scan a file and return only the recognised text.

        Args:
            image_bytes: Raw file bytes.
            file_type: File type, 1 for image, 0 for PDF.
            ocr_options: Extra keyword arguments forwarded to the OCR call.

        Returns:
            A dict with a single ``ocr_text`` entry (``""`` when nothing
            was recognised).
        """
        ocr_result = await self.ocr_service.recognize_image_async(
            image_bytes, file_type, **(ocr_options or {})
        )
        return {"ocr_text": self._extract_ocr_text(ocr_result)}

    async def message_create_bill(
        self, message: str, document_type: Optional[str] = None
    ) -> Dict[str, Any]:
        """Help create a bill from a free-text message.

        Follows the same rules as ``ocr_create_bill``: a blank message is
        not parsed, and a missing *document_type* is auto-detected from the
        message before the template is looked up.

        Args:
            message: Text message describing the bill.
            document_type: Document type, e.g. ``'invoice'``.

        Returns:
            A dict with ``success``; when parsing produced data it also
            contains ``document_type``, ``parsed_data`` and
            ``has_parsed_data``.
        """
        document_type, parsed_data = await self._resolve_and_parse(
            message, document_type
        )
        result = {"success": parsed_data is not None}
        result.update(self._parsed_fields(document_type, parsed_data))
        return result

    async def _detect_document_type(self, ocr_text: str) -> Optional[str]:
        """Detect the document type with a simple keyword search.

        Returns ``'invoice'``, ``'receipt'`` or ``'order'`` for the first
        keyword group that matches, or ``None`` when nothing matches.
        """
        text_lower = ocr_text.lower()
        for doc_type, keywords in self._TYPE_KEYWORDS:
            if any(keyword in text_lower for keyword in keywords):
                return doc_type
        return None

    async def get_available_templates(self) -> List[Dict]:
        """Return the available templates.

        Each entry holds the template ``name``, its ``description`` and the
        ``fields`` given by the template's ``output_schema()``.
        """
        templates = TemplateRegistry.list_templates()
        return [
            {
                "name": name,
                "description": desc,
                "fields": TemplateRegistry.get_template(name).output_schema(),
            }
            for name, desc in templates.items()
        ]
|