# longjoedyy/LongjoeAgent

from typing import Dict, Any, List, Optional
from core.ocr_service import PaddleOCRService
from .llm_parser import LLMParser
from .templates.template_registry import TemplateRegistry
import os
from utils.logger import chat_logger


class DocumentProcessingService:
    """单据处理服务（简化版）"""

    def __init__(self, ocr_service: PaddleOCRService = None):
        """Set up the OCR backend and the LLM parser.

        Args:
            ocr_service: OCR service to use (dependency injection). When it is
                omitted or falsy, a default service is created through
                ``_create_default_ocr_service``.
        """
        if ocr_service:
            self.ocr_service = ocr_service
        else:
            self.ocr_service = self._create_default_ocr_service()

        self.llm_parser = LLMParser()

    def _create_default_ocr_service(self) -> PaddleOCRService:
        """Build the default OCR service from environment variables.

        ``PADDLE_OCR_API_URL`` and ``PADDLE_OCR_TOKEN`` are read from the
        environment; each falls back to an empty string when unset.
        """
        endpoint = os.environ.get("PADDLE_OCR_API_URL", "")
        credential = os.environ.get("PADDLE_OCR_TOKEN", "")
        return PaddleOCRService(api_url=endpoint, token=credential)

    async def ocr_create_bill(
        self, image_bytes: bytes, document_type: str = None, ocr_options: Dict = None
    ) -> Dict[str, Any]:
        """
        通过扫描图片辅助建立单据

        Args:
            image_bytes: 图片字节
            document_type: 单据类型，如'invoice'
            ocr_options: OCR选项

        Returns:
            处理结果
        """
        # 1. OCR识别
        ocr_result = await self.ocr_service.recognize_image_async(
            image_bytes, **(ocr_options or {})
        )

        # 2. 提取文本
        ocr_text = self.ocr_service.extract_text_from_result(ocr_result)

        if not ocr_text or not ocr_text.strip():
            chat_logger.warning("OCR未识别到有效文本")
            ocr_text = ""

        # 3. 如果没有指定类型，尝试自动检测
        if not document_type and ocr_text:
            document_type = await self._detect_document_type(ocr_text)

        # 4. 如果有单据类型，用LLM解析
        parsed_data = None
        if document_type and ocr_text:
            template = TemplateRegistry.get_template(document_type)
            parsed_data = await self.llm_parser.parse_to_json(ocr_text, template)

        # 5. 返回结果
        result = {
            "success": parsed_data is not None,
            "ocr_raw": ocr_result,  # 原始API返回
            "ocr_text": ocr_text,  # 格式化文本
        }

        if parsed_data:
            result.update(
                {
                    "document_type": document_type,
                    "parsed_data": parsed_data,
                    "has_parsed_data": True,
                }
            )

        return result

    async def pure_ocr(
        self, image_bytes: bytes, file_type: int = 1, ocr_options: Dict = None
    ) -> Dict[str, Any]:
        """
        扫描图片并返回OCR识别结果

        Args:
            image_bytes: 图片字节
            file_type: 文件类型，1: 图片, 0: PDF
            ocr_options: OCR选项

        Returns:
            处理结果
        """
        # 1. OCR识别
        ocr_result = await self.ocr_service.recognize_image_async(
            image_bytes, file_type, **(ocr_options or {})
        )

        # 2. 提取文本
        ocr_text = self.ocr_service.extract_text_from_result(ocr_result)

        if not ocr_text or not ocr_text.strip():
            chat_logger.warning("OCR未识别到有效文本")
            ocr_text = ""

        result = {
            "ocr_text": ocr_text,  # 格式化文本
        }

        return result

    async def message_create_bill(
        self, message: str, document_type: str = None
    ) -> Dict[str, Any]:
        """
        Help create a bill from a plain-text message.

        The message is parsed by the LLM parser against the template that
        ``TemplateRegistry`` returns for ``document_type``.

        Args:
            message: The text message.
            document_type: Document type, e.g. 'invoice'.

        Returns:
            A dict with ``success`` (True when the parser returned something
            other than None). When the parsed data is truthy, the dict also
            carries ``document_type``, ``parsed_data`` and
            ``has_parsed_data``.
        """
        # Diagnostics go through the shared logger instead of print(), so they
        # follow the application's logging configuration and stay off stdout.
        chat_logger.debug(f"message_create_bill message: {message}")
        chat_logger.debug(f"message_create_bill document_type: {document_type}")

        template = TemplateRegistry.get_template(document_type)
        parsed_data = await self.llm_parser.parse_to_json(message, template)
        chat_logger.debug(f"message_create_bill parsed_data: {parsed_data}")

        # Assemble the response. Note that the detail keys are attached only
        # when parsed_data is truthy, while ``success`` only checks for None.
        result = {"success": parsed_data is not None}

        if parsed_data:
            result.update(
                {
                    "document_type": document_type,
                    "parsed_data": parsed_data,
                    "has_parsed_data": True,
                }
            )
        chat_logger.debug(f"message_create_bill result: {result}")
        return result

    async def _detect_document_type(self, ocr_text: str) -> Optional[str]:
        """简单关键词检测单据类型"""
        text_lower = ocr_text.lower()

        if any(keyword in text_lower for keyword in ["发票", "增值税", "专用发票"]):
            return "invoice"
        elif any(keyword in text_lower for keyword in ["收据", "收款", "收条"]):
            return "receipt"
        elif any(keyword in text_lower for keyword in ["订单", "订货单", "采购单"]):
            return "order"

        return None

    async def get_available_templates(self) -> List[Dict]:
        """Return the list of available templates.

        Each entry holds the template's ``name`` and ``description`` as listed
        by ``TemplateRegistry.list_templates()``, plus ``fields``, the result
        of that template's ``output_schema()``.
        """
        catalog = []
        for template_name, summary in TemplateRegistry.list_templates().items():
            entry = {
                "name": template_name,
                "description": summary,
                "fields": TemplateRegistry.get_template(template_name).output_schema(),
            }
            catalog.append(entry)
        return catalog