|
|
@@ -3,24 +3,138 @@ from typing import List
|
|
|
import requests
|
|
|
import json
|
|
|
import os
|
|
|
-from .base_tool import html_to_text, get_unique_match_count
|
|
|
+from .base_tool import (
|
|
|
+ html_to_text,
|
|
|
+ get_unique_match_count,
|
|
|
+ calculate_relevance_score,
|
|
|
+ find_most_relevant_document,
|
|
|
+)
|
|
|
from config.settings import settings
|
|
|
|
|
|
|
|
|
-# @tool
|
|
|
-# def get_knowledge_list(filter_words: List[str], match_limit: int = 3) -> str:
|
|
|
-# """根据关键词筛选知识库文章列表
|
|
|
@tool
def get_knowledge_list(filter_words: List[str], match_limit: int = 3) -> str:
    """Filter the knowledge-base article list by keywords.

    Keyword-splitting principle (guidance for whoever builds ``filter_words``):
        Keep keywords minimal: split the user's question into the smallest
        meaningful units, ideally two characters per keyword. Examples:
        "销售订单终止数量失败" → ["销售", "订单", "终止", "数量", "失败"];
        "销售订单提示没有新建权限怎么办" → ["销售", "订单", "新建", "权限"].

    Args:
        filter_words: Keywords looked up in each article's lower-cased
            "DocName keyword" text. When empty, every article is returned.
        match_limit: Minimum value ``get_unique_match_count`` must report for
            an article to be included (default 3). If nothing comes back,
            retry with a smaller value (minimum 1).

    Returns:
        One newline-terminated line per selected article, formatted as
        ``DocID:DocName|keyword`` (or ``DocID:DocName`` when the article has
        no keyword); the DocID is what a follow-up content request needs.
        On a non-200 response or any exception, an error message string is
        returned instead of raising.
    """
    # NOTE(review): if `tool` is a LangChain-style decorator, the docstring
    # above doubles as the tool description shown to the model — confirm
    # before rewording it further.
    print(f"正在查询知识库列表,筛选关键词:{filter_words} 匹配下限:{match_limit}")

    kms_list_url = settings.KMS_LIST_URL
    payload = {
        "categorycodeList": [],
        "ignoreTypeSub": False,
        "ignoreStandardByTopic": True,
    }
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Content-Type": "application/json",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    }

    try:
        response = requests.post(
            kms_list_url, headers=headers, json=payload, timeout=10
        )
        if response.status_code != 200:
            return f"请求失败,状态码: {response.status_code}"

        matched_lines = []
        for doc in response.json()["docList"]:
            doc_id = doc["DocID"]
            doc_name = doc["DocName"]
            # A missing or null keyword is treated as "no keyword" so that it
            # neither aborts the listing nor leaks the text "none" into the
            # searchable string.
            doc_keywords = doc.get("keyword") or ""

            if filter_words:
                search_text = f"{doc_name} {doc_keywords}".lower()
                match_count = get_unique_match_count(search_text, filter_words)
                if match_count < match_limit:
                    continue

            matched_lines.append(
                f"{doc_id}:{doc_name}|{doc_keywords}"
                if doc_keywords
                else f"{doc_id}:{doc_name}"
            )

        return "".join(f"{line}\n" for line in matched_lines)
    except Exception as e:
        # Failures are reported as text so the caller always gets a string.
        return f"请求异常: {e}"
|
|
|
+
|
|
|
|
|
|
@tool
def get_knowledge_content(docid: str) -> str:
    """Fetch one knowledge-base article and return its content as plain text.

    Args:
        docid: DocID of the knowledge-base article to fetch.

    Returns:
        The article's ``DocHtml`` field converted by ``html_to_text``. On a
        non-200 response or any exception, an error message string is
        returned instead of raising.
    """
    print(f"正在获取知识库文章内容,DocID: {docid}")

    kms_view_url = settings.KMS_VIEW_URL
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Content-Type": "application/json",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    }

    try:
        response = requests.post(
            kms_view_url, headers=headers, json={"docid": docid}, timeout=10
        )
        if response.status_code != 200:
            return f"请求失败,状态码: {response.status_code}"

        # `or ""` also covers a DocHtml that is present but null, which a
        # plain .get() default would pass through as None.
        doc_html = response.json().get("DocHtml") or ""
        plain_text = html_to_text(doc_html)
        print(f"已获取到ID: {docid}的文章内容,长度{len(plain_text)}")
        return plain_text
    except Exception as e:
        # Failures are reported as text so the caller always gets a string.
        return f"请求异常: {e}"
|
|
|
+
|
|
|
+
|
|
|
+# @tool
|
|
|
+# def search_and_retrieve_knowledge(keywords: List[str], max_matches: int = 10) -> str:
|
|
|
+# """搜索并获取最相关的知识库文章内容
|
|
|
+
|
|
|
+# 此工具会:
|
|
|
+# 1. 自动搜索知识库文章列表
|
|
|
+# 2. 使用改进的智能算法根据关键词匹配度排序,选择最相关的文章
|
|
|
+# 3. 自动获取并返回该文章的完整内容
|
|
|
+# 拆分关键词核心原则:
|
|
|
+# 最小化原则:将用户问题拆分为最小单位的关键词,最好2个字一个关键词
|
|
|
+# 例如:"销售订单终止数量失败" → ["销售", "订单", "终止", "数量", "失败"],"销售订单提示没有新建权限怎么办" → ["销售", "订单", "新建", "权限"]
|
|
|
# Args:
|
|
|
-# filter_words: 关键词列表,匹配任一关键词即返回
|
|
|
-# match_limit: 最小匹配数(默认3),无结果时可减少重试(最小1)
|
|
|
+# keywords: 关键词列表,会尽可能细化匹配
|
|
|
+# max_matches: 最多考虑的匹配文章数(默认10)
|
|
|
|
|
|
# Returns:
|
|
|
-# 文章列表,格式:每行"DocID:DocName|keyword",用于后续获取内容
|
|
|
+# 最相关文章的完整内容
|
|
|
# """
|
|
|
-# print(f"正在查询知识库列表,筛选关键词:{filter_words} 匹配下限:{match_limit}")
|
|
|
+# print(f"正在搜索知识库,关键词:{keywords}")
|
|
|
|
|
|
-# kms_list_url = settings.KMS_LIST_URL # os.getenv("KMS_LIST_URL")
|
|
|
+# # 1. 获取文章列表
|
|
|
+# kms_list_url = settings.KMS_LIST_URL
|
|
|
# payload = {
|
|
|
# "categorycodeList": [],
|
|
|
# "ignoreTypeSub": False,
|
|
|
@@ -34,183 +148,78 @@ from config.settings import settings
|
|
|
# }
|
|
|
|
|
|
# try:
|
|
|
+# # 获取所有文章
|
|
|
# response = requests.post(
|
|
|
# kms_list_url, headers=headers, json=payload, timeout=10
|
|
|
# )
|
|
|
-# if response.status_code == 200:
|
|
|
-# data = response.json()
|
|
|
-# matched_lines = ""
|
|
|
-# for doc in data["docList"]:
|
|
|
-# doc_id = doc["DocID"]
|
|
|
-# doc_name = doc["DocName"]
|
|
|
-# doc_keywords = doc["keyword"]
|
|
|
-# search_text = f"{doc_name} {doc_keywords}".lower()
|
|
|
-
|
|
|
-# if not filter_words:
|
|
|
-# line = (
|
|
|
-# f"{doc_id}:{doc_name}|{doc_keywords}"
|
|
|
-# if doc_keywords
|
|
|
-# else f"{doc_id}:{doc_name}"
|
|
|
-# )
|
|
|
-# matched_lines += line + "\n"
|
|
|
-# else:
|
|
|
-# match_count = get_unique_match_count(search_text, filter_words)
|
|
|
-# if match_count >= match_limit:
|
|
|
-# line = (
|
|
|
-# f"{doc_id}:{doc_name}|{doc_keywords}"
|
|
|
-# if doc_keywords
|
|
|
-# else f"{doc_id}:{doc_name}"
|
|
|
-# )
|
|
|
-# matched_lines += line + "\n"
|
|
|
-
|
|
|
-# return matched_lines
|
|
|
-# else:
|
|
|
-# return f"请求失败,状态码: {response.status_code}"
|
|
|
-# except Exception as e:
|
|
|
-# return f"请求异常: {e}"
|
|
|
-
|
|
|
-
|
|
|
-# @tool
|
|
|
-# def get_knowledge_content(docid: str) -> str:
|
|
|
-# """获取知识库文章内容
|
|
|
-
|
|
|
-# Args:
|
|
|
-# docid: 知识库文章的DocID
|
|
|
-
|
|
|
-# Returns:
|
|
|
-# 知识库文章内容
|
|
|
-# """
|
|
|
-# print(f"正在获取知识库文章内容,DocID: {docid}")
|
|
|
-
|
|
|
-# kms_view_url = settings.KMS_VIEW_URL # os.getenv("KMS_VIEW_URL")
|
|
|
-# headers = {
|
|
|
-# "Accept": "application/json, text/plain, */*",
|
|
|
-# "Content-Type": "application/json",
|
|
|
-# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
|
|
-# }
|
|
|
-
|
|
|
-# try:
|
|
|
-# payload = {"docid": docid}
|
|
|
-# response = requests.post(
|
|
|
-# kms_view_url, headers=headers, json=payload, timeout=10
|
|
|
+# if response.status_code != 200:
|
|
|
+# return f"获取文章列表失败,状态码: {response.status_code}"
|
|
|
+
|
|
|
+# data = response.json()
|
|
|
+# doc_list = data.get("docList", [])
|
|
|
+
|
|
|
+# if not doc_list:
|
|
|
+# return "知识库中没有找到文章"
|
|
|
+
|
|
|
+# # 打印所有文章标题用于调试
|
|
|
+# print(f"知识库中共有 {len(doc_list)} 篇文章")
|
|
|
+# print("所有文章标题:")
|
|
|
+# for i, doc in enumerate(doc_list[:20]): # 只显示前20个
|
|
|
+# print(f" {i+1}. {doc['DocName']}")
|
|
|
+
|
|
|
+# # 2. 使用改进的算法找到最相关的文档
|
|
|
+# relevant_docs = find_most_relevant_document(doc_list, keywords, max_matches)
|
|
|
+
|
|
|
+# if not relevant_docs:
|
|
|
+# # 如果没有找到匹配的文档,尝试使用更宽松的匹配
|
|
|
+# print("使用宽松匹配重新搜索...")
|
|
|
+# # 只保留核心关键词重新搜索
|
|
|
+# core_keywords = [kw for kw in keywords if len(kw) >= 2]
|
|
|
+# if core_keywords:
|
|
|
+# relevant_docs = find_most_relevant_document(
|
|
|
+# doc_list, core_keywords, max_matches
|
|
|
+# )
|
|
|
+
|
|
|
+# if not relevant_docs:
|
|
|
+# return "没有找到与关键词相关的文章"
|
|
|
+
|
|
|
+# print(f"找到 {len(relevant_docs)} 篇相关文章,按相关性排序")
|
|
|
+# for i, doc in enumerate(relevant_docs):
|
|
|
+# print(
|
|
|
+# f" {i+1}. {doc['doc_name']} (得分:{doc['relevance_score']:.2f}, 匹配{doc['match_count']}个关键词)"
|
|
|
+# )
|
|
|
+
|
|
|
+# # 3. 获取最相关的文章内容
|
|
|
+# best_doc = relevant_docs[0]
|
|
|
+# print(
|
|
|
+# f"选择最相关的文章: {best_doc['doc_name']} (DocID: {best_doc['doc_id']}, 得分:{best_doc['relevance_score']:.2f})"
|
|
|
# )
|
|
|
-# if response.status_code == 200:
|
|
|
-# data = response.json()
|
|
|
-# doc_html = data.get("DocHtml", "")
|
|
|
-# plain_text = html_to_text(doc_html)
|
|
|
-# print(f"已获取到ID: {docid}的文章内容,长度{len(plain_text)}")
|
|
|
-# return plain_text
|
|
|
-# else:
|
|
|
-# return f"请求失败,状态码: {response.status_code}"
|
|
|
-# except Exception as e:
|
|
|
-# return f"请求异常: {e}"
|
|
|
-
|
|
|
|
|
|
-@tool
|
|
|
-def search_and_retrieve_knowledge(keywords: List[str], max_matches: int = 5) -> str:
|
|
|
- """搜索并获取最相关的知识库文章内容
|
|
|
-
|
|
|
- 此工具会:
|
|
|
- 1. 自动搜索知识库文章列表
|
|
|
- 2. 根据关键词匹配度排序,选择匹配最多关键词的文章
|
|
|
- 3. 自动获取并返回该文章的完整内容
|
|
|
- 拆分关键词核心原则:
|
|
|
- 最小化原则:将用户问题拆分为最小单位的关键词,最好2个字一个关键词
|
|
|
- 例如:"销售订单终止数量失败" → ["销售", "订单", "终止", "数量", "失败"],"销售订单提示没有新建权限怎么办" → ["销售", "订单", "新建", "权限"]
|
|
|
- Args:
|
|
|
- keywords: 关键词列表,会尽可能细化匹配
|
|
|
- max_matches: 最多考虑的匹配文章数(默认5)
|
|
|
-
|
|
|
- Returns:
|
|
|
- 最相关文章的完整内容
|
|
|
- """
|
|
|
- print(f"正在搜索知识库,关键词:{keywords}")
|
|
|
-
|
|
|
- # 1. 获取文章列表
|
|
|
- kms_list_url = settings.KMS_LIST_URL
|
|
|
- payload = {
|
|
|
- "categorycodeList": [],
|
|
|
- "ignoreTypeSub": False,
|
|
|
- "ignoreStandardByTopic": True,
|
|
|
- }
|
|
|
-
|
|
|
- headers = {
|
|
|
- "Accept": "application/json, text/plain, */*",
|
|
|
- "Content-Type": "application/json",
|
|
|
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
|
|
- }
|
|
|
-
|
|
|
- try:
|
|
|
- # 获取所有文章
|
|
|
- response = requests.post(
|
|
|
- kms_list_url, headers=headers, json=payload, timeout=10
|
|
|
- )
|
|
|
- if response.status_code != 200:
|
|
|
- return f"获取文章列表失败,状态码: {response.status_code}"
|
|
|
-
|
|
|
- data = response.json()
|
|
|
- doc_list = data.get("docList", [])
|
|
|
-
|
|
|
- if not doc_list:
|
|
|
- return "知识库中没有找到文章"
|
|
|
-
|
|
|
- # 2. 计算匹配度并排序
|
|
|
- matched_docs = []
|
|
|
- for doc in doc_list:
|
|
|
- doc_id = doc["DocID"]
|
|
|
- doc_name = doc["DocName"]
|
|
|
- doc_keywords = doc["keyword"]
|
|
|
- search_text = f"{doc_name} {doc_keywords}".lower()
|
|
|
-
|
|
|
- # 计算匹配的关键词数量
|
|
|
- match_count = get_unique_match_count(search_text, keywords)
|
|
|
- if match_count > 0:
|
|
|
- matched_docs.append({
|
|
|
- "doc_id": doc_id,
|
|
|
- "doc_name": doc_name,
|
|
|
- "match_count": match_count,
|
|
|
- "keywords": doc_keywords
|
|
|
- })
|
|
|
-
|
|
|
- if not matched_docs:
|
|
|
- return "没有找到与关键词相关的文章"
|
|
|
-
|
|
|
- # 按匹配数量降序排序,选择前max_matches个
|
|
|
- matched_docs.sort(key=lambda x: x["match_count"], reverse=True)
|
|
|
- top_docs = matched_docs[:max_matches]
|
|
|
-
|
|
|
- print(f"找到 {len(matched_docs)} 篇相关文章,前{len(top_docs)}篇匹配度最高")
|
|
|
- for i, doc in enumerate(top_docs):
|
|
|
- print(f" {i+1}. {doc['doc_name']} (匹配{doc['match_count']}个关键词)")
|
|
|
-
|
|
|
- # 3. 获取匹配度最高的文章内容
|
|
|
- best_doc = top_docs[0]
|
|
|
- print(f"选择最相关的文章: {best_doc['doc_name']} (DocID: {best_doc['doc_id']})")
|
|
|
-
|
|
|
- # 4. 获取文章内容
|
|
|
- kms_view_url = settings.KMS_VIEW_URL
|
|
|
- content_payload = {"docid": best_doc['doc_id']}
|
|
|
- content_response = requests.post(
|
|
|
- kms_view_url, headers=headers, json=content_payload, timeout=10
|
|
|
- )
|
|
|
+# # 4. 获取文章内容
|
|
|
+# kms_view_url = settings.KMS_VIEW_URL
|
|
|
+# content_payload = {"docid": best_doc["doc_id"]}
|
|
|
+# content_response = requests.post(
|
|
|
+# kms_view_url, headers=headers, json=content_payload, timeout=10
|
|
|
+# )
|
|
|
|
|
|
- if content_response.status_code != 200:
|
|
|
- return f"获取文章内容失败,状态码: {content_response.status_code}"
|
|
|
+# if content_response.status_code != 200:
|
|
|
+# return f"获取文章内容失败,状态码: {content_response.status_code}"
|
|
|
|
|
|
- content_data = content_response.json()
|
|
|
- doc_html = content_data.get("DocHtml", "")
|
|
|
- plain_text = html_to_text(doc_html)
|
|
|
+# content_data = content_response.json()
|
|
|
+# doc_html = content_data.get("DocHtml", "")
|
|
|
+# plain_text = html_to_text(doc_html)
|
|
|
|
|
|
- # 5. 构建返回结果
|
|
|
- result = f"【知识库文章】\n"
|
|
|
- result += f"标题: {best_doc['doc_name']}\n"
|
|
|
- result += f"匹配关键词数量: {best_doc['match_count']}\n"
|
|
|
- if best_doc['keywords']:
|
|
|
- result += f"文章关键词: {best_doc['keywords']}\n"
|
|
|
- result += f"内容: {plain_text}\n"
|
|
|
+# # 5. 构建返回结果
|
|
|
+# result = f"【知识库文章】\n"
|
|
|
+# result += f"标题: {best_doc['doc_name']}\n"
|
|
|
+# result += f"相关性得分: {best_doc['relevance_score']:.2f}\n"
|
|
|
+# result += f"匹配关键词数量: {best_doc['match_count']}\n"
|
|
|
+# if best_doc["keywords"]:
|
|
|
+# result += f"文章关键词: {best_doc['keywords']}\n"
|
|
|
+# result += f"内容: {plain_text}\n"
|
|
|
|
|
|
- print(f"已获取文章内容,长度: {len(plain_text)} 字符")
|
|
|
- return result
|
|
|
+# print(f"已获取文章内容,长度: {len(plain_text)} 字符")
|
|
|
+# return result
|
|
|
|
|
|
- except Exception as e:
|
|
|
- return f"搜索和获取知识库文章时出错: {e}"
|
|
|
+# except Exception as e:
|
|
|
+# return f"搜索和获取知识库文章时出错: {e}"
|