knowledge_tools.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. from langchain.tools import tool
  2. from typing import List
  3. import requests
  4. import json
  5. import os
  6. from .base_tool import html_to_text, get_unique_match_count
  7. from config.settings import settings
  8. # @tool
  9. # def get_knowledge_list(filter_words: List[str], match_limit: int = 3) -> str:
  10. # """根据关键词筛选知识库文章列表
  11. # Args:
  12. # filter_words: 关键词列表,匹配任一关键词即返回
  13. # match_limit: 最小匹配数(默认3),无结果时可减少重试(最小1)
  14. # Returns:
  15. # 文章列表,格式:每行"DocID:DocName|keyword",用于后续获取内容
  16. # """
  17. # print(f"正在查询知识库列表,筛选关键词:{filter_words} 匹配下限:{match_limit}")
  18. # kms_list_url = settings.KMS_LIST_URL # os.getenv("KMS_LIST_URL")
  19. # payload = {
  20. # "categorycodeList": [],
  21. # "ignoreTypeSub": False,
  22. # "ignoreStandardByTopic": True,
  23. # }
  24. # headers = {
  25. # "Accept": "application/json, text/plain, */*",
  26. # "Content-Type": "application/json",
  27. # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  28. # }
  29. # try:
  30. # response = requests.post(
  31. # kms_list_url, headers=headers, json=payload, timeout=10
  32. # )
  33. # if response.status_code == 200:
  34. # data = response.json()
  35. # matched_lines = ""
  36. # for doc in data["docList"]:
  37. # doc_id = doc["DocID"]
  38. # doc_name = doc["DocName"]
  39. # doc_keywords = doc["keyword"]
  40. # search_text = f"{doc_name} {doc_keywords}".lower()
  41. # if not filter_words:
  42. # line = (
  43. # f"{doc_id}:{doc_name}|{doc_keywords}"
  44. # if doc_keywords
  45. # else f"{doc_id}:{doc_name}"
  46. # )
  47. # matched_lines += line + "\n"
  48. # else:
  49. # match_count = get_unique_match_count(search_text, filter_words)
  50. # if match_count >= match_limit:
  51. # line = (
  52. # f"{doc_id}:{doc_name}|{doc_keywords}"
  53. # if doc_keywords
  54. # else f"{doc_id}:{doc_name}"
  55. # )
  56. # matched_lines += line + "\n"
  57. # return matched_lines
  58. # else:
  59. # return f"请求失败,状态码: {response.status_code}"
  60. # except Exception as e:
  61. # return f"请求异常: {e}"
  62. # @tool
  63. # def get_knowledge_content(docid: str) -> str:
  64. # """获取知识库文章内容
  65. # Args:
  66. # docid: 知识库文章的DocID
  67. # Returns:
  68. # 知识库文章内容
  69. # """
  70. # print(f"正在获取知识库文章内容,DocID: {docid}")
  71. # kms_view_url = settings.KMS_VIEW_URL # os.getenv("KMS_VIEW_URL")
  72. # headers = {
  73. # "Accept": "application/json, text/plain, */*",
  74. # "Content-Type": "application/json",
  75. # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  76. # }
  77. # try:
  78. # payload = {"docid": docid}
  79. # response = requests.post(
  80. # kms_view_url, headers=headers, json=payload, timeout=10
  81. # )
  82. # if response.status_code == 200:
  83. # data = response.json()
  84. # doc_html = data.get("DocHtml", "")
  85. # plain_text = html_to_text(doc_html)
  86. # print(f"已获取到ID: {docid}的文章内容,长度{len(plain_text)}")
  87. # return plain_text
  88. # else:
  89. # return f"请求失败,状态码: {response.status_code}"
  90. # except Exception as e:
  91. # return f"请求异常: {e}"
  92. @tool
  93. def search_and_retrieve_knowledge(keywords: List[str], max_matches: int = 5) -> str:
  94. """搜索并获取最相关的知识库文章内容
  95. 此工具会:
  96. 1. 自动搜索知识库文章列表
  97. 2. 根据关键词匹配度排序,选择匹配最多关键词的文章
  98. 3. 自动获取并返回该文章的完整内容
  99. 拆分关键词核心原则:
  100. 最小化原则:将用户问题拆分为最小单位的关键词,最好2个字一个关键词
  101. 例如:"销售订单终止数量失败" → ["销售", "订单", "终止", "数量", "失败"],"销售订单提示没有新建权限怎么办" → ["销售", "订单", "新建", "权限"]
  102. Args:
  103. keywords: 关键词列表,会尽可能细化匹配
  104. max_matches: 最多考虑的匹配文章数(默认5)
  105. Returns:
  106. 最相关文章的完整内容
  107. """
  108. print(f"正在搜索知识库,关键词:{keywords}")
  109. # 1. 获取文章列表
  110. kms_list_url = settings.KMS_LIST_URL
  111. payload = {
  112. "categorycodeList": [],
  113. "ignoreTypeSub": False,
  114. "ignoreStandardByTopic": True,
  115. }
  116. headers = {
  117. "Accept": "application/json, text/plain, */*",
  118. "Content-Type": "application/json",
  119. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  120. }
  121. try:
  122. # 获取所有文章
  123. response = requests.post(
  124. kms_list_url, headers=headers, json=payload, timeout=10
  125. )
  126. if response.status_code != 200:
  127. return f"获取文章列表失败,状态码: {response.status_code}"
  128. data = response.json()
  129. doc_list = data.get("docList", [])
  130. if not doc_list:
  131. return "知识库中没有找到文章"
  132. # 2. 计算匹配度并排序
  133. matched_docs = []
  134. for doc in doc_list:
  135. doc_id = doc["DocID"]
  136. doc_name = doc["DocName"]
  137. doc_keywords = doc["keyword"]
  138. search_text = f"{doc_name} {doc_keywords}".lower()
  139. # 计算匹配的关键词数量
  140. match_count = get_unique_match_count(search_text, keywords)
  141. if match_count > 0:
  142. matched_docs.append({
  143. "doc_id": doc_id,
  144. "doc_name": doc_name,
  145. "match_count": match_count,
  146. "keywords": doc_keywords
  147. })
  148. if not matched_docs:
  149. return "没有找到与关键词相关的文章"
  150. # 按匹配数量降序排序,选择前max_matches个
  151. matched_docs.sort(key=lambda x: x["match_count"], reverse=True)
  152. top_docs = matched_docs[:max_matches]
  153. print(f"找到 {len(matched_docs)} 篇相关文章,前{len(top_docs)}篇匹配度最高")
  154. for i, doc in enumerate(top_docs):
  155. print(f" {i+1}. {doc['doc_name']} (匹配{doc['match_count']}个关键词)")
  156. # 3. 获取匹配度最高的文章内容
  157. best_doc = top_docs[0]
  158. print(f"选择最相关的文章: {best_doc['doc_name']} (DocID: {best_doc['doc_id']})")
  159. # 4. 获取文章内容
  160. kms_view_url = settings.KMS_VIEW_URL
  161. content_payload = {"docid": best_doc['doc_id']}
  162. content_response = requests.post(
  163. kms_view_url, headers=headers, json=content_payload, timeout=10
  164. )
  165. if content_response.status_code != 200:
  166. return f"获取文章内容失败,状态码: {content_response.status_code}"
  167. content_data = content_response.json()
  168. doc_html = content_data.get("DocHtml", "")
  169. plain_text = html_to_text(doc_html)
  170. # 5. 构建返回结果
  171. result = f"【知识库文章】\n"
  172. result += f"标题: {best_doc['doc_name']}\n"
  173. result += f"匹配关键词数量: {best_doc['match_count']}\n"
  174. if best_doc['keywords']:
  175. result += f"文章关键词: {best_doc['keywords']}\n"
  176. result += f"内容: {plain_text}\n"
  177. print(f"已获取文章内容,长度: {len(plain_text)} 字符")
  178. return result
  179. except Exception as e:
  180. return f"搜索和获取知识库文章时出错: {e}"