import os
from typing import Any, Dict, List, Optional

from core.ocr_service import PaddleOCRService
from .llm_parser import LLMParser
from .templates.template_registry import TemplateRegistry
from utils.logger import chat_logger


class DocumentProcessingService:
    """Document (bill) processing service, simplified version.

    Combines an OCR backend, a keyword-based document-type detector and an
    LLM parser to turn scanned images or free-text messages into structured
    bill data.
    """

    # Ordered (document_type, keywords) table used by _detect_document_type.
    # Order matters: the first entry with a matching keyword wins.
    _DOCUMENT_TYPE_KEYWORDS = (
        ("invoice", ("发票", "增值税", "专用发票")),
        ("receipt", ("收据", "收款", "收条")),
        ("order", ("订单", "订货单", "采购单")),
    )

    def __init__(self, ocr_service: Optional[PaddleOCRService] = None):
        """Create the service.

        Args:
            ocr_service: OCR service to use (dependency injection). When
                omitted (or falsy), a default one is built from environment
                variables via ``_create_default_ocr_service``.
        """
        self.ocr_service = ocr_service or self._create_default_ocr_service()
        self.llm_parser = LLMParser()

    def _create_default_ocr_service(self) -> PaddleOCRService:
        """Build the default OCR service from environment variables.

        Reads ``PADDLE_OCR_API_URL`` and ``PADDLE_OCR_TOKEN``; each falls
        back to an empty string when unset.
        """
        return PaddleOCRService(
            api_url=os.getenv("PADDLE_OCR_API_URL", ""),
            token=os.getenv("PADDLE_OCR_TOKEN", ""),
        )

    def _extract_ocr_text(self, ocr_result: Any) -> str:
        """Return the text extracted from ``ocr_result``, or ``""`` if blank.

        Logs a warning when the OCR service yields no text or only
        whitespace.
        """
        ocr_text = self.ocr_service.extract_text_from_result(ocr_result)
        if not ocr_text or not ocr_text.strip():
            chat_logger.warning("OCR未识别到有效文本")
            return ""
        return ocr_text

    @staticmethod
    def _parsed_fields(
        document_type: Optional[str], parsed_data: Any
    ) -> Dict[str, Any]:
        """Return the extra result fields describing a parse.

        The fields are only produced when ``parsed_data`` is truthy; a
        ``None`` or empty parse yields an empty dict.
        """
        if not parsed_data:
            return {}
        return {
            "document_type": document_type,
            "parsed_data": parsed_data,
            "has_parsed_data": True,
        }

    async def ocr_create_bill(
        self,
        image_bytes: bytes,
        document_type: Optional[str] = None,
        ocr_options: Optional[Dict] = None,
    ) -> Dict[str, Any]:
        """Help create a bill from a scanned image.

        Args:
            image_bytes: Raw bytes of the image.
            document_type: Document type, e.g. ``'invoice'``. When omitted,
                the type is guessed from keywords in the OCR text.
            ocr_options: Extra keyword arguments forwarded to the OCR
                service's ``recognize_image_async``.

        Returns:
            A dict with ``success`` (True when the parser returned something
            other than ``None``), ``ocr_raw`` (the raw OCR response) and
            ``ocr_text`` (the extracted text, ``""`` when nothing was
            recognised). When the parsed data is non-empty it also carries
            ``document_type``, ``parsed_data`` and ``has_parsed_data``.
        """
        # 1. Run OCR on the image.
        ocr_result = await self.ocr_service.recognize_image_async(
            image_bytes, **(ocr_options or {})
        )

        # 2. Extract the recognised text (normalised to "" when blank).
        ocr_text = self._extract_ocr_text(ocr_result)

        # 3. No type given: try to detect it from the text.
        if not document_type and ocr_text:
            document_type = await self._detect_document_type(ocr_text)

        # 4. With both a type and some text, let the LLM parse the text.
        parsed_data = None
        if document_type and ocr_text:
            template = TemplateRegistry.get_template(document_type)
            parsed_data = await self.llm_parser.parse_to_json(ocr_text, template)

        # 5. Assemble the result.
        result = {
            "success": parsed_data is not None,
            "ocr_raw": ocr_result,  # raw API response
            "ocr_text": ocr_text,  # formatted text
        }
        result.update(self._parsed_fields(document_type, parsed_data))
        return result

    async def pure_ocr(
        self,
        image_bytes: bytes,
        file_type: int = 1,
        ocr_options: Optional[Dict] = None,
    ) -> Dict[str, Any]:
        """Scan a file and return only the OCR text.

        Args:
            image_bytes: Raw bytes of the file.
            file_type: File type; 1: image, 0: PDF.
            ocr_options: Extra keyword arguments forwarded to the OCR
                service's ``recognize_image_async``.

        Returns:
            A dict with a single key ``ocr_text`` (``""`` when nothing was
            recognised).
        """
        # 1. Run OCR; file_type is passed positionally after the bytes.
        ocr_result = await self.ocr_service.recognize_image_async(
            image_bytes, file_type, **(ocr_options or {})
        )

        # 2. Extract the recognised text.
        return {
            "ocr_text": self._extract_ocr_text(ocr_result),  # formatted text
        }

    async def message_create_bill(
        self, message: str, document_type: Optional[str] = None
    ) -> Dict[str, Any]:
        """Help create a bill from a text message.

        Args:
            message: The text message to parse.
            document_type: Document type, e.g. ``'invoice'``. It is passed
                to ``TemplateRegistry.get_template`` as-is, including
                ``None``.
                NOTE(review): unlike ``ocr_create_bill`` there is no
                auto-detection here; confirm how the registry treats a
                missing type.

        Returns:
            A dict with ``success`` (True when the parser returned something
            other than ``None``). When the parsed data is non-empty it also
            carries ``document_type``, ``parsed_data`` and
            ``has_parsed_data``.
        """
        # Diagnostics go through the module logger instead of stdout. The
        # message is pre-formatted so no logger-specific placeholder syntax
        # is relied upon.
        chat_logger.debug(
            f"message_create_bill input: message={message!r}, "
            f"document_type={document_type!r}"
        )

        template = TemplateRegistry.get_template(document_type)
        parsed_data = await self.llm_parser.parse_to_json(message, template)
        chat_logger.debug(f"message_create_bill parsed_data: {parsed_data!r}")

        result = {"success": parsed_data is not None}
        result.update(self._parsed_fields(document_type, parsed_data))
        chat_logger.debug(f"message_create_bill result: {result!r}")
        return result

    async def _detect_document_type(self, ocr_text: str) -> Optional[str]:
        """Detect the document type with a simple keyword search.

        Returns ``"invoice"``, ``"receipt"`` or ``"order"`` for the first
        group (in that order) that has a keyword contained in the
        lower-cased text, or ``None`` when no keyword matches.
        """
        text_lower = ocr_text.lower()
        for doc_type, keywords in self._DOCUMENT_TYPE_KEYWORDS:
            if any(keyword in text_lower for keyword in keywords):
                return doc_type
        return None

    async def get_available_templates(self) -> List[Dict]:
        """Return the list of available templates.

        Each entry holds the template ``name``, its ``description`` and
        ``fields`` (the template's ``output_schema()``).
        """
        templates = TemplateRegistry.list_templates()
        return [
            {
                "name": name,
                "description": desc,
                "fields": TemplateRegistry.get_template(name).output_schema(),
            }
            for name, desc in templates.items()
        ]