from abc import ABC, abstractmethod
from datetime import datetime
import json
from typing import Any, Dict, List, Optional

from pydantic import BaseModel

from config.template_config_manager import TemplateConfigManager  # newly added import


class DocumentTemplate(ABC):
    """Abstract base class for document-extraction templates.

    A template describes one kind of document: its name, a description, the
    JSON structure to extract and the extraction rules. From these it builds
    the system prompt. Guidance returned by ``get_hardcoded_guidance`` is
    merged with guidance read from the template's configuration.
    """

    def __init__(self, config_manager: Optional[TemplateConfigManager] = None):
        """Load the configuration that belongs to this template.

        Args:
            config_manager: Source of per-template configuration. When
                ``None`` a default ``TemplateConfigManager()`` is created.

        Note:
            ``template_name`` is read here, so a subclass must be able to
            answer it before ``super().__init__()`` runs.
        """
        if config_manager is None:
            config_manager = TemplateConfigManager()
        self.config_manager = config_manager
        # Fall back to an empty mapping so later ``.get`` calls stay safe if
        # the manager returns nothing for this template.
        self._template_config = (
            self.config_manager.get_template_config(self.template_name) or {}
        )

    @property
    @abstractmethod
    def template_name(self) -> str:
        """Identifier of the template."""
        pass

    @property
    @abstractmethod
    def description(self) -> str:
        """Description of the template."""
        pass

    def get_hardcoded_guidance(self) -> Dict[str, Any]:
        """Return the guidance built into the template class.

        Subclasses may override this. The base implementation returns empty
        guidance.

        Returns:
            A dict with the keys ``"field_guidance"`` (field name -> list of
            hints) and ``"additional_rules"`` (free text).
        """
        return {"field_guidance": {}, "additional_rules": ""}

    @property
    def system_prompt(self) -> str:
        """Build the system prompt, including hardcoded and configured guidance.

        The prompt contains the current local time, the JSON output schema,
        the extraction rules, an optional block of extra guidance, and a
        fixed closing checklist.
        """
        hardcoded = self.get_hardcoded_guidance() or {}
        configured = self._template_config

        merged_field_guidance = self._merge_field_guidance(
            hardcoded.get("field_guidance") or {},
            configured.get("field_guidance") or {},
        )

        schema_json = json.dumps(self.output_schema(), indent=2, ensure_ascii=False)
        # Explicit "\n" escapes keep the prompt layout independent of how
        # this source file happens to be formatted.
        prompt = (
            f"你是一个专业的单据信息提取助手。现在时间是{datetime.now().isoformat()}\n"
            "请从OCR识别结果或用户的输入中提取信息,并严格按照以下JSON格式返回:\n"
            f"{schema_json}\n\n"
            f"提取规则:\n{self.extraction_rules()}"
        )

        # The extra-guidance section is added only when it has content.
        extended_prompt = self._build_extended_guidance(
            merged_field_guidance,
            hardcoded.get("additional_rules") or "",
            configured.get("additional_rules") or "",
        )
        if extended_prompt:
            prompt += f"\n\n额外指导信息:\n{extended_prompt}"

        prompt += (
            "\n\n请确保:\n"
            "1. 只提取明确存在的字段\n"
            "2. 日期格式统一为YYYY-MM-DD\n"
            "3. 数字类型保持原样"
        )
        return prompt

    @staticmethod
    def _as_hint_list(value: Any) -> List[Any]:
        """Normalise one field's hints to a list.

        ``None`` becomes an empty list, a single string becomes a
        one-element list, and any other iterable is copied into a list.
        """
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    def _merge_field_guidance(
        self, hardcoded: Dict[str, list], configured: Dict[str, list]
    ) -> Dict[str, list]:
        """Merge hardcoded and configured field hints, removing duplicates.

        Args:
            hardcoded: Field name -> hints defined by the template class.
            configured: Field name -> hints read from configuration.

        Returns:
            Field name -> de-duplicated hints, hardcoded hints first. Fields
            appear in first-seen order (hardcoded fields, then fields that
            exist only in the configuration). Empty hints and fields left
            without any hint are dropped.
        """
        hardcoded = hardcoded or {}
        configured = configured or {}

        merged: Dict[str, list] = {}
        # dict.fromkeys gives an order-preserving union of the field names;
        # a plain set would make the prompt order vary between runs.
        for field in dict.fromkeys([*hardcoded, *configured]):
            candidates = self._as_hint_list(hardcoded.get(field))
            candidates += self._as_hint_list(configured.get(field))
            # Drop empty hints, then de-duplicate while keeping order.
            unique_hints = list(dict.fromkeys(h for h in candidates if h))
            if unique_hints:
                merged[field] = unique_hints
        return merged

    def _build_extended_guidance(
        self,
        field_guidance: Dict[str, list],
        hardcoded_rules: str,
        configured_rules: str,
    ) -> str:
        """Render the extra-guidance section of the prompt.

        Args:
            field_guidance: Field name -> hints, as produced by
                ``_merge_field_guidance``.
            hardcoded_rules: Additional rules defined by the template class.
            configured_rules: Additional rules read from configuration.

        Returns:
            The section text, or ``""`` when there is nothing to add.
        """
        field_lines = [
            f"- {field_name}: {'; '.join(str(hint) for hint in hints)}"
            for field_name, hints in (field_guidance or {}).items()
            if hints
        ]

        guidance_parts: List[str] = []
        # Only write the header when at least one field line follows it.
        if field_lines:
            guidance_parts.append("字段识别指导:")
            guidance_parts.extend(field_lines)

        # Keep non-blank rules only; an identical rule from both sources is
        # emitted once.
        additional_rules = list(
            dict.fromkeys(
                rule
                for rule in (hardcoded_rules, configured_rules)
                if rule and rule.strip()
            )
        )
        if additional_rules:
            guidance_parts.append(f"特殊规则: {'; '.join(additional_rules)}")

        return "\n".join(guidance_parts)

    @abstractmethod
    def output_schema(self) -> Dict[str, Any]:
        """Return the definition of the JSON output structure."""
        pass

    @abstractmethod
    def extraction_rules(self) -> str:
        """Return the field-extraction rules as text."""
        pass

    def validate_result(self, result: Dict) -> bool:
        """Validate a parsed result.

        The base implementation accepts every result; subclasses may
        override it.
        """
        return True

    def post_process(self, result: Dict) -> Dict:
        """Post-processing hook that attaches template metadata.

        Args:
            result: Parsed result. It is modified in place.

        Returns:
            The same ``result`` object, with a ``"_metadata"`` entry holding
            the template name, the processing time and the description.
        """
        result["_metadata"] = {
            "template": self.template_name,
            "processed_at": datetime.now().isoformat(),
            "template_description": self.description,
        }
        return result