knowledge_tools.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. from langchain.tools import tool
  2. from typing import List
  3. import requests
  4. import json
  5. import os
  6. from .base_tool import html_to_text, get_unique_match_count
  7. @tool
  8. def get_knowledge_list(filter_words: List[str], match_limit: int = 3) -> str:
  9. """获取知识库列表,返回知识库DocID和标题DocName以及关键字keyword的列表,根据关键字及标题,按用户问题筛选出要用到的文章后, 通过DocID获取文章内容,另外会提供访问文章正文的工具,最终返回的文章数最好在10篇以内
  10. Args:
  11. filter_words: 筛选知识库文章的关键词列表,只要符合其中任意一个关键词,就会返回该文章
  12. match_limit: 筛选知识库文章的匹配计数,默认值为3,即只要符合3个关键词,就会返回该文章。但如果没有符合的文章或数量太少影响作答,可再次调用该工具,将match_limit减1,直到符合文章,但match_limit不能小于1
  13. Returns:
  14. 一个包含知识库DocID、标题DocName和关键字keyword的列表,每个元素为字符串,字符串格式为"DocID:DocName|keyword",如果没有keyword,则格式为"DocID:DocName",每个元素之间用换行符分隔
  15. """
  16. print(f"正在查询知识库列表,筛选关键词:{filter_words} 匹配下限:{match_limit}")
  17. kms_list_url = os.getenv("KMS_LIST_URL")
  18. payload = {
  19. "categorycodeList": [],
  20. "ignoreTypeSub": False,
  21. "ignoreStandardByTopic": True,
  22. }
  23. headers = {
  24. "Accept": "application/json, text/plain, */*",
  25. "Content-Type": "application/json",
  26. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  27. }
  28. try:
  29. response = requests.post(
  30. kms_list_url, headers=headers, json=payload, timeout=10
  31. )
  32. if response.status_code == 200:
  33. data = response.json()
  34. matched_lines = ""
  35. for doc in data["docList"]:
  36. doc_id = doc["DocID"]
  37. doc_name = doc["DocName"]
  38. doc_keywords = doc["keyword"]
  39. search_text = f"{doc_name} {doc_keywords}".lower()
  40. if not filter_words:
  41. line = (
  42. f"{doc_id}:{doc_name}|{doc_keywords}"
  43. if doc_keywords
  44. else f"{doc_id}:{doc_name}"
  45. )
  46. matched_lines += line + "\n"
  47. else:
  48. match_count = get_unique_match_count(search_text, filter_words)
  49. if match_count >= match_limit:
  50. line = (
  51. f"{doc_id}:{doc_name}|{doc_keywords}"
  52. if doc_keywords
  53. else f"{doc_id}:{doc_name}"
  54. )
  55. matched_lines += line + "\n"
  56. return matched_lines
  57. else:
  58. return f"请求失败,状态码: {response.status_code}"
  59. except Exception as e:
  60. return f"请求异常: {e}"
  61. @tool
  62. def get_knowledge_content(docid: str) -> str:
  63. """获取知识库文章内容
  64. Args:
  65. docid: 知识库文章的DocID
  66. Returns:
  67. 知识库文章内容
  68. """
  69. print(f"正在获取知识库文章内容,DocID: {docid}")
  70. kms_view_url = os.getenv("KMS_VIEW_URL")
  71. headers = {
  72. "Accept": "application/json, text/plain, */*",
  73. "Content-Type": "application/json",
  74. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  75. }
  76. try:
  77. payload = {"docid": docid}
  78. response = requests.post(
  79. kms_view_url, headers=headers, json=payload, timeout=10
  80. )
  81. if response.status_code == 200:
  82. data = response.json()
  83. doc_html = data.get("DocHtml", "")
  84. plain_text = html_to_text(doc_html)
  85. print(f"已获取到ID: {docid}的文章内容,长度{len(plain_text)}")
  86. return plain_text
  87. else:
  88. return f"请求失败,状态码: {response.status_code}"
  89. except Exception as e:
  90. return f"请求异常: {e}"