llm_parser.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
  1. from langchain_openai import ChatOpenAI
  2. from langchain.messages import HumanMessage, SystemMessage
  3. import json
  4. import re
  5. from typing import Dict, Any
  6. from utils.logger import chat_logger
  7. import os
  8. from config.settings import settings
  9. class LLMParser:
  10. """LLM解析器"""
  11. def __init__(self, llm_config: Dict = None):
  12. self.llm = ChatOpenAI(
  13. model=settings.LLM_MODEL,
  14. temperature=settings.LLM_TEMPERATURE,
  15. api_key=settings.DEEPSEEK_API_KEY,
  16. base_url=settings.DEEPSEEK_BASE_URL,
  17. max_tokens=settings.LLM_MAX_TOKENS,
  18. )
  19. async def parse_to_json(self, ocr_text: str, template) -> Dict[str, Any]:
  20. """使用模板解析OCR文本为JSON"""
  21. print("template.system_prompt:", template.system_prompt)
  22. messages = [
  23. SystemMessage(content=template.system_prompt),
  24. HumanMessage(content=f"请从以下文本中提取信息:\n\n{ocr_text}"),
  25. ]
  26. try:
  27. response = await self.llm.ainvoke(messages)
  28. content = response.content # response.generations[0][0].text
  29. # 提取JSON(处理可能的Markdown格式)
  30. json_match = re.search(r"```json\n(.*?)\n```", content, re.DOTALL)
  31. if json_match:
  32. content = json_match.group(1)
  33. result = json.loads(content)
  34. # 验证结果
  35. if template.validate_result(result):
  36. result = template.post_process(result)
  37. return result
  38. else:
  39. raise ValueError("解析结果验证失败")
  40. except json.JSONDecodeError as e:
  41. chat_logger.error(f"JSON解析失败: {e}, 原始内容: {content}")
  42. raise
  43. except Exception as e:
  44. chat_logger.error(f"LLM解析失败: {e}")
  45. raise