| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153 |
- from abc import ABC, abstractmethod
- from typing import Dict, Any
- from pydantic import BaseModel
- import json
- from datetime import datetime
- from config.template_config_manager import TemplateConfigManager # 新增导入
class DocumentTemplate(ABC):
    """Base class for document-extraction templates (config-extensible).

    A concrete template supplies ``template_name``, ``description``,
    ``output_schema()`` and ``extraction_rules()``.  The base class assembles
    the LLM system prompt from those pieces plus optional guidance that comes
    from two sources: values hardcoded in the subclass
    (``get_hardcoded_guidance``) and values returned by the configuration
    manager for this template.
    """

    def __init__(self, config_manager: "TemplateConfigManager | None" = None):
        """Store the configuration manager and load this template's config.

        Args:
            config_manager: Object exposing ``get_template_config(name)``.
                When omitted, a default ``TemplateConfigManager()`` is built.
        """
        # Compare against None explicitly: a caller-supplied manager must be
        # honoured even if the object happens to evaluate as falsy.
        if config_manager is None:
            config_manager = TemplateConfigManager()
        self.config_manager = config_manager
        # Normalize a missing/None config to an empty dict so later ``.get``
        # lookups cannot fail.
        self._template_config = (
            self.config_manager.get_template_config(self.template_name) or {}
        )

    @property
    @abstractmethod
    def template_name(self) -> str:
        """Identifier of the template (used as the configuration lookup key)."""

    @property
    @abstractmethod
    def description(self) -> str:
        """Human-readable description of the template."""

    def get_hardcoded_guidance(self) -> Dict[str, Any]:
        """Return guidance hardcoded in the template; subclasses may override.

        Returns:
            A dict with ``"field_guidance"`` (field name -> list of hints) and
            ``"additional_rules"`` (free text).  Both are empty by default.
        """
        return {"field_guidance": {}, "additional_rules": ""}

    @property
    def system_prompt(self) -> str:
        """Build the system prompt, extended with merged guidance if any.

        The prompt contains the current timestamp, the JSON output schema,
        the extraction rules, an optional extra-guidance section (only when
        there is something to say) and a fixed closing checklist.
        """
        hardcoded = self.get_hardcoded_guidance() or {}
        # NOTE(review): the original comment states the manager already
        # filters blank values; that cannot be confirmed from this file, so
        # None values are tolerated here as well.
        configured = self._template_config or {}

        merged_field_guidance = self._merge_field_guidance(
            hardcoded.get("field_guidance") or {},
            configured.get("field_guidance") or {},
        )

        base_prompt = f"""你是一个专业的单据信息提取助手。现在时间是{datetime.now().isoformat()}
请从OCR识别结果或用户的输入中提取信息,并严格按照以下JSON格式返回:
{json.dumps(self.output_schema(), indent=2, ensure_ascii=False)}
提取规则:
{self.extraction_rules()}"""

        # Append the merged guidance only when it has content.
        extended_prompt = self._build_extended_guidance(
            merged_field_guidance,
            hardcoded.get("additional_rules") or "",
            configured.get("additional_rules") or "",
        )
        if extended_prompt:
            base_prompt += f"\n\n额外指导信息:\n{extended_prompt}"

        base_prompt += """
请确保:
1. 只提取明确存在的字段
2. 日期格式统一为YYYY-MM-DD
3. 数字类型保持原样"""
        return base_prompt

    @staticmethod
    def _as_hint_list(value: Any) -> list:
        """Coerce a hint entry to a list: None -> [], a lone string -> [string]."""
        if value is None:
            return []
        if isinstance(value, str):
            # A bare string is one hint, not a sequence of characters.
            return [value]
        return list(value)

    def _merge_field_guidance(
        self, hardcoded: Dict[str, list], configured: Dict[str, list]
    ) -> Dict[str, list]:
        """Merge hardcoded and configured field hints, de-duplicated.

        Field order is deterministic: fields appear in first-seen order,
        hardcoded fields first, then fields that only exist in the config.
        Within a field, hardcoded hints precede configured ones; empty hints
        and repeats are dropped.  Fields left without hints are omitted.

        Args:
            hardcoded: Field name -> hints defined in code (may be None).
            configured: Field name -> hints from configuration (may be None).

        Returns:
            Field name -> non-empty list of unique hints.
        """
        hardcoded = hardcoded or {}
        configured = configured or {}

        merged: Dict[str, list] = {}
        # dict.fromkeys keeps insertion order; a set would make the prompt's
        # field order vary between interpreter runs.
        for field in dict.fromkeys([*hardcoded, *configured]):
            hints = self._as_hint_list(hardcoded.get(field)) + self._as_hint_list(
                configured.get(field)
            )
            unique_hints = list(dict.fromkeys(hint for hint in hints if hint))
            if unique_hints:
                merged[field] = unique_hints
        return merged

    def _build_extended_guidance(
        self,
        field_guidance: Dict[str, list],
        hardcoded_rules: str,
        configured_rules: str,
    ) -> str:
        """Render the extra-guidance section of the prompt.

        Args:
            field_guidance: Field name -> hints, as produced by
                ``_merge_field_guidance``.
            hardcoded_rules: Free-text rules defined in code.
            configured_rules: Free-text rules from configuration.

        Returns:
            The section text, or ``""`` when there is nothing to add.
        """
        guidance_parts = []

        field_lines = []
        for field_name, hints in (field_guidance or {}).items():
            if hints:
                combined_hints = "; ".join(str(hint) for hint in hints)
                field_lines.append(f"- {field_name}: {combined_hints}")
        # Emit the header only when at least one field line follows it.
        if field_lines:
            guidance_parts.append("字段识别指导:")
            guidance_parts.extend(field_lines)

        # Keep only rules that contain something other than whitespace.
        additional_rules = [
            rules
            for rules in (hardcoded_rules, configured_rules)
            if rules and rules.strip()
        ]
        if additional_rules:
            guidance_parts.append(f"特殊规则: {'; '.join(additional_rules)}")

        return "\n".join(guidance_parts)

    @abstractmethod
    def output_schema(self) -> Dict[str, Any]:
        """Return the JSON output structure definition (must be JSON-serializable)."""

    @abstractmethod
    def extraction_rules(self) -> str:
        """Return the textual field-extraction rules inserted into the prompt."""

    def validate_result(self, result: Dict) -> bool:
        """Validate a parsed result; the base implementation accepts everything."""
        return True

    def post_process(self, result: Dict) -> Dict:
        """Post-processing hook: attach a ``_metadata`` entry to the result.

        The passed dict is modified in place and also returned.
        """
        result["_metadata"] = {
            "template": self.template_name,
            "processed_at": datetime.now().isoformat(),
            "template_description": self.description,
        }
        return result
|