# document_service.py
  1. from typing import Dict, Any, List, Optional
  2. from core.ocr_service import PaddleOCRService
  3. from .llm_parser import LLMParser
  4. from .templates.template_registry import TemplateRegistry
  5. import os
  6. from utils.logger import chat_logger
  7. class DocumentProcessingService:
  8. """单据处理服务(简化版)"""
  9. def __init__(self, ocr_service: PaddleOCRService = None):
  10. # 依赖注入OCR服务
  11. self.ocr_service = ocr_service or self._create_default_ocr_service()
  12. self.llm_parser = LLMParser()
  13. def _create_default_ocr_service(self) -> PaddleOCRService:
  14. """创建默认OCR服务"""
  15. return PaddleOCRService(
  16. api_url=os.getenv("PADDLE_OCR_API_URL", ""),
  17. token=os.getenv("PADDLE_OCR_TOKEN", ""),
  18. )
  19. async def ocr_create_bill(
  20. self, image_bytes: bytes, document_type: str = None, ocr_options: Dict = None
  21. ) -> Dict[str, Any]:
  22. """
  23. 通过扫描图片辅助建立单据
  24. Args:
  25. image_bytes: 图片字节
  26. document_type: 单据类型,如'invoice'
  27. ocr_options: OCR选项
  28. Returns:
  29. 处理结果
  30. """
  31. # 1. OCR识别
  32. ocr_result = await self.ocr_service.recognize_image_async(
  33. image_bytes, **(ocr_options or {})
  34. )
  35. # 2. 提取文本
  36. ocr_text = self.ocr_service.extract_text_from_result(ocr_result)
  37. if not ocr_text or not ocr_text.strip():
  38. chat_logger.warning("OCR未识别到有效文本")
  39. ocr_text = ""
  40. # 3. 如果没有指定类型,尝试自动检测
  41. if not document_type and ocr_text:
  42. document_type = await self._detect_document_type(ocr_text)
  43. # 4. 如果有单据类型,用LLM解析
  44. parsed_data = None
  45. if document_type and ocr_text:
  46. template = TemplateRegistry.get_template(document_type)
  47. parsed_data = await self.llm_parser.parse_to_json(ocr_text, template)
  48. # 5. 返回结果
  49. result = {
  50. "success": parsed_data is not None,
  51. "ocr_raw": ocr_result, # 原始API返回
  52. "ocr_text": ocr_text, # 格式化文本
  53. }
  54. if parsed_data:
  55. result.update(
  56. {
  57. "document_type": document_type,
  58. "parsed_data": parsed_data,
  59. "has_parsed_data": True,
  60. }
  61. )
  62. return result
  63. async def pure_ocr(
  64. self, image_bytes: bytes, file_type: int = 1, ocr_options: Dict = None
  65. ) -> Dict[str, Any]:
  66. """
  67. 扫描图片并返回OCR识别结果
  68. Args:
  69. image_bytes: 图片字节
  70. file_type: 文件类型,1: 图片, 0: PDF
  71. ocr_options: OCR选项
  72. Returns:
  73. 处理结果
  74. """
  75. # 1. OCR识别
  76. ocr_result = await self.ocr_service.recognize_image_async(
  77. image_bytes, file_type, **(ocr_options or {})
  78. )
  79. # 2. 提取文本
  80. ocr_text = self.ocr_service.extract_text_from_result(ocr_result)
  81. if not ocr_text or not ocr_text.strip():
  82. chat_logger.warning("OCR未识别到有效文本")
  83. ocr_text = ""
  84. result = {
  85. "ocr_text": ocr_text, # 格式化文本
  86. }
  87. return result
  88. async def message_create_bill(
  89. self, message: str, document_type: str = None
  90. ) -> Dict[str, Any]:
  91. """
  92. 通过文本消息辅助建立单据
  93. Args:
  94. message: 文本消息
  95. document_type: 单据类型,如'invoice'
  96. Returns:
  97. 处理结果
  98. """
  99. print("message:", message)
  100. print("document_type:", document_type)
  101. template = TemplateRegistry.get_template(document_type)
  102. parsed_data = await self.llm_parser.parse_to_json(message, template)
  103. print("parsed_data:", parsed_data)
  104. # 5. 返回结果
  105. result = {"success": parsed_data is not None}
  106. if parsed_data:
  107. result.update(
  108. {
  109. "document_type": document_type,
  110. "parsed_data": parsed_data,
  111. "has_parsed_data": True,
  112. }
  113. )
  114. print("result:", result)
  115. return result
  116. async def _detect_document_type(self, ocr_text: str) -> Optional[str]:
  117. """简单关键词检测单据类型"""
  118. text_lower = ocr_text.lower()
  119. if any(keyword in text_lower for keyword in ["发票", "增值税", "专用发票"]):
  120. return "invoice"
  121. elif any(keyword in text_lower for keyword in ["收据", "收款", "收条"]):
  122. return "receipt"
  123. elif any(keyword in text_lower for keyword in ["订单", "订货单", "采购单"]):
  124. return "order"
  125. return None
  126. async def get_available_templates(self) -> List[Dict]:
  127. """获取可用模板列表"""
  128. templates = TemplateRegistry.list_templates()
  129. return [
  130. {
  131. "name": name,
  132. "description": desc,
  133. "fields": TemplateRegistry.get_template(name).output_schema(),
  134. }
  135. for name, desc in templates.items()
  136. ]