knowledge_tools.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
  1. from langchain.tools import tool
  2. from typing import List
  3. import requests
  4. import json
  5. import os
  6. from .base_tool import (
  7. html_to_text,
  8. get_unique_match_count,
  9. calculate_relevance_score,
  10. find_most_relevant_document,
  11. )
  12. from config.settings import settings
  @tool
  def get_knowledge_list(filter_words: List[str], match_limit: int = 3) -> str:
      """Filter the knowledge-base article list by keywords.

      Core principle for splitting keywords:
      # Minimal units: break the user's question into the smallest possible
      # keywords, ideally two characters per keyword.
      # Examples: "销售订单终止数量失败" → ["销售", "订单", "终止", "数量", "失败"],
      # "销售订单提示没有新建权限怎么办" → ["销售", "订单", "新建", "权限"]

      Args:
          filter_words: Keywords checked against each article's name and
              keywords. An empty list returns every article unfiltered.
          match_limit: Minimum match count (as computed by
              get_unique_match_count) an article needs in order to be
              listed; default 3. When nothing is returned, lower it and
              retry (minimum 1).

      Returns:
          One line per listed article, formatted "DocID:DocName|keyword"
          (or "DocID:DocName" when the article has no keywords), for use
          when fetching article content afterwards. When the request fails,
          a message carrying the HTTP status code or the exception text is
          returned instead of raising.
      """
      # NOTE(review): langchain's @tool normally exposes this docstring to the
      # model as the tool description -- confirm the wording with the prompt
      # owner before changing it further.
      print(f"正在查询知识库列表,筛选关键词:{filter_words} 匹配下限:{match_limit}")
      kms_list_url = settings.KMS_LIST_URL
      payload = {
          "categorycodeList": [],
          "ignoreTypeSub": False,
          "ignoreStandardByTopic": True,
      }
      headers = {
          "Accept": "application/json, text/plain, */*",
          "Content-Type": "application/json",
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
      }
      try:
          response = requests.post(
              kms_list_url, headers=headers, json=payload, timeout=10
          )
          if response.status_code != 200:
              return f"请求失败,状态码: {response.status_code}"

          matched_lines = []
          for doc in response.json()["docList"]:
              doc_id = doc["DocID"]
              doc_name = doc["DocName"]
              doc_keywords = doc["keyword"]

              if filter_words:
                  # A missing keyword field must contribute nothing to the
                  # searchable text; interpolating None directly would add
                  # the literal word "none" and allow spurious matches.
                  search_text = f"{doc_name} {doc_keywords or ''}".lower()
                  match_count = get_unique_match_count(search_text, filter_words)
                  if match_count < match_limit:
                      continue

              matched_lines.append(
                  f"{doc_id}:{doc_name}|{doc_keywords}"
                  if doc_keywords
                  else f"{doc_id}:{doc_name}"
              )

          # Every line, including the last one, is newline-terminated.
          return "".join(f"{line}\n" for line in matched_lines)
      except Exception as e:
          # Tool boundary: report the failure as text so the calling agent
          # receives a result instead of an unhandled exception.
          return f"请求异常: {e}"
  @tool
  def get_knowledge_content(docid: str) -> str:
      """Fetch the content of a knowledge-base article.

      Args:
          docid: DocID of the knowledge-base article.

      Returns:
          The article content: the response's "DocHtml" field passed through
          html_to_text. When the request fails, a message carrying the HTTP
          status code or the exception text is returned instead of raising.
      """
      # NOTE(review): langchain's @tool normally exposes this docstring to the
      # model as the tool description -- confirm the wording with the prompt
      # owner before changing it further.
      print(f"正在获取知识库文章内容,DocID: {docid}")
      view_url = settings.KMS_VIEW_URL
      request_headers = {
          "Accept": "application/json, text/plain, */*",
          "Content-Type": "application/json",
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
      }
      try:
          resp = requests.post(
              view_url,
              headers=request_headers,
              json={"docid": docid},
              timeout=10,
          )
          # Guard clause: anything other than HTTP 200 is reported as a failure.
          if resp.status_code != 200:
              return f"请求失败,状态码: {resp.status_code}"

          article_text = html_to_text(resp.json().get("DocHtml", ""))
          print(f"已获取到ID: {docid}的文章内容,长度{len(article_text)}")
          return article_text
      except Exception as err:
          # Tool boundary: surface the failure as text rather than raising.
          return f"请求异常: {err}"
  100. # @tool
  101. # def search_and_retrieve_knowledge(keywords: List[str], max_matches: int = 10) -> str:
  102. # """搜索并获取最相关的知识库文章内容
  103. # 此工具会:
  104. # 1. 自动搜索知识库文章列表
  105. # 2. 使用改进的智能算法根据关键词匹配度排序,选择最相关的文章
  106. # 3. 自动获取并返回该文章的完整内容
  107. # 拆分关键词核心原则:
  108. # 最小化原则:将用户问题拆分为最小单位的关键词,最好2个字一个关键词
  109. # 例如:"销售订单终止数量失败" → ["销售", "订单", "终止", "数量", "失败"],"销售订单提示没有新建权限怎么办" → ["销售", "订单", "新建", "权限"]
  110. # Args:
  111. # keywords: 关键词列表,会尽可能细化匹配
  112. # max_matches: 最多考虑的匹配文章数(默认10)
  113. # Returns:
  114. # 最相关文章的完整内容
  115. # """
  116. # print(f"正在搜索知识库,关键词:{keywords}")
  117. # # 1. 获取文章列表
  118. # kms_list_url = settings.KMS_LIST_URL
  119. # payload = {
  120. # "categorycodeList": [],
  121. # "ignoreTypeSub": False,
  122. # "ignoreStandardByTopic": True,
  123. # }
  124. # headers = {
  125. # "Accept": "application/json, text/plain, */*",
  126. # "Content-Type": "application/json",
  127. # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  128. # }
  129. # try:
  130. # # 获取所有文章
  131. # response = requests.post(
  132. # kms_list_url, headers=headers, json=payload, timeout=10
  133. # )
  134. # if response.status_code != 200:
  135. # return f"获取文章列表失败,状态码: {response.status_code}"
  136. # data = response.json()
  137. # doc_list = data.get("docList", [])
  138. # if not doc_list:
  139. # return "知识库中没有找到文章"
  140. # # 打印所有文章标题用于调试
  141. # print(f"知识库中共有 {len(doc_list)} 篇文章")
  142. # print("所有文章标题:")
  143. # for i, doc in enumerate(doc_list[:20]): # 只显示前20个
  144. # print(f" {i+1}. {doc['DocName']}")
  145. # # 2. 使用改进的算法找到最相关的文档
  146. # relevant_docs = find_most_relevant_document(doc_list, keywords, max_matches)
  147. # if not relevant_docs:
  148. # # 如果没有找到匹配的文档,尝试使用更宽松的匹配
  149. # print("使用宽松匹配重新搜索...")
  150. # # 只保留核心关键词重新搜索
  151. # core_keywords = [kw for kw in keywords if len(kw) >= 2]
  152. # if core_keywords:
  153. # relevant_docs = find_most_relevant_document(
  154. # doc_list, core_keywords, max_matches
  155. # )
  156. # if not relevant_docs:
  157. # return "没有找到与关键词相关的文章"
  158. # print(f"找到 {len(relevant_docs)} 篇相关文章,按相关性排序")
  159. # for i, doc in enumerate(relevant_docs):
  160. # print(
  161. # f" {i+1}. {doc['doc_name']} (得分:{doc['relevance_score']:.2f}, 匹配{doc['match_count']}个关键词)"
  162. # )
  163. # # 3. 获取最相关的文章内容
  164. # best_doc = relevant_docs[0]
  165. # print(
  166. # f"选择最相关的文章: {best_doc['doc_name']} (DocID: {best_doc['doc_id']}, 得分:{best_doc['relevance_score']:.2f})"
  167. # )
  168. # # 4. 获取文章内容
  169. # kms_view_url = settings.KMS_VIEW_URL
  170. # content_payload = {"docid": best_doc["doc_id"]}
  171. # content_response = requests.post(
  172. # kms_view_url, headers=headers, json=content_payload, timeout=10
  173. # )
  174. # if content_response.status_code != 200:
  175. # return f"获取文章内容失败,状态码: {content_response.status_code}"
  176. # content_data = content_response.json()
  177. # doc_html = content_data.get("DocHtml", "")
  178. # plain_text = html_to_text(doc_html)
  179. # # 5. 构建返回结果
  180. # result = f"【知识库文章】\n"
  181. # result += f"标题: {best_doc['doc_name']}\n"
  182. # result += f"相关性得分: {best_doc['relevance_score']:.2f}\n"
  183. # result += f"匹配关键词数量: {best_doc['match_count']}\n"
  184. # if best_doc["keywords"]:
  185. # result += f"文章关键词: {best_doc['keywords']}\n"
  186. # result += f"内容: {plain_text}\n"
  187. # print(f"已获取文章内容,长度: {len(plain_text)} 字符")
  188. # return result
  189. # except Exception as e:
  190. # return f"搜索和获取知识库文章时出错: {e}"