import json
import os
from typing import List

import requests
from langchain.tools import tool

from .base_tool import (
    html_to_text,
    get_unique_match_count,
    calculate_relevance_score,
    find_most_relevant_document,
)
from config.settings import settings

# Request headers shared by every call to the KMS endpoints.
_KMS_HEADERS = {
    "Accept": "application/json, text/plain, */*",
    "Content-Type": "application/json",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
}

# Per-request timeout, in seconds, passed to requests.post.
_KMS_TIMEOUT = 10


def _format_doc_line(doc_id, doc_name, doc_keywords) -> str:
    """Render one article as "DocID:DocName|keyword" (no "|keyword" when empty)."""
    if doc_keywords:
        return f"{doc_id}:{doc_name}|{doc_keywords}"
    return f"{doc_id}:{doc_name}"


# List knowledge-base articles, optionally filtered by keywords.
#
# Parameters:
#   filter_words: keywords to look for in each article's name and keyword
#       field. An empty list disables filtering and lists every article.
#   match_limit: minimum number of distinct keywords an article must match.
#       It is capped at the number of distinct keywords supplied, so a short
#       keyword list is not rejected outright by the default of 3.
#
# Returns:
#   One "DocID:DocName|keyword" line per matching article (each terminated by
#   a newline), an empty string when nothing matches, or an error message
#   string when the request fails.
#
# NOTE: the docstring below is kept verbatim in its original language because
# the @tool decorator exposes it at runtime as the tool description; editing
# it would change what the model is told.
@tool
def get_knowledge_list(filter_words: List[str], match_limit: int = 3) -> str:
    """根据关键词筛选知识库文章列表

    拆分关键词核心原则:
    # 最小化原则:将用户问题拆分为最小单位的关键词,最好2个字一个关键词
    # 例如:"销售订单终止数量失败" → ["销售", "订单", "终止", "数量", "失败"],"销售订单提示没有新建权限怎么办" → ["销售", "订单", "新建", "权限"]

    Args:
        filter_words: 关键词列表,匹配任一关键词即返回
        match_limit: 最小匹配数(默认3),无结果时可减少重试(最小1)

    Returns:
        文章列表,格式:每行"DocID:DocName|keyword",用于后续获取内容
    """
    print(f"正在查询知识库列表,筛选关键词:{filter_words} 匹配下限:{match_limit}")
    kms_list_url = settings.KMS_LIST_URL  # alternative source: os.getenv("KMS_LIST_URL")
    payload = {
        "categorycodeList": [],
        "ignoreTypeSub": False,
        "ignoreStandardByTopic": True,
    }
    # Errors are reported as strings rather than raised: the return value is
    # handed back to the calling agent, which cannot handle an exception.
    try:
        response = requests.post(
            kms_list_url, headers=_KMS_HEADERS, json=payload, timeout=_KMS_TIMEOUT
        )
        if response.status_code != 200:
            return f"请求失败,状态码: {response.status_code}"

        data = response.json()
        # A missing or null "docList" is treated as an empty article list.
        docs = data.get("docList") or []

        # The search text is lower-cased below, so the keywords must be too,
        # otherwise keywords containing upper-case letters could never match.
        keywords = [word.lower() for word in filter_words or []]
        # Cap the threshold at the number of distinct keywords; this assumes
        # get_unique_match_count never returns more than that - TODO confirm.
        effective_limit = (
            min(match_limit, len(set(keywords))) if keywords else match_limit
        )

        matched = []
        for doc in docs:
            doc_id = doc["DocID"]
            doc_name = doc["DocName"]
            # A null/missing keyword field must not leak the text "None"
            # into the search text or the output line.
            doc_keywords = doc.get("keyword") or ""
            if keywords:
                search_text = f"{doc_name} {doc_keywords}".lower()
                if get_unique_match_count(search_text, keywords) < effective_limit:
                    continue
            matched.append(_format_doc_line(doc_id, doc_name, doc_keywords))

        return "".join(f"{line}\n" for line in matched)
    except Exception as e:
        return f"请求异常: {e}"


# Fetch one knowledge-base article and return its body as plain text.
#
# Parameters:
#   docid: the DocID of the article, as listed by get_knowledge_list.
#
# Returns:
#   The article's "DocHtml" field converted with html_to_text, or an error
#   message string when the request fails.
#
# NOTE: the docstring below is kept verbatim in its original language because
# the @tool decorator exposes it at runtime as the tool description.
@tool
def get_knowledge_content(docid: str) -> str:
    """获取知识库文章内容

    Args:
        docid: 知识库文章的DocID

    Returns:
        知识库文章内容
    """
    print(f"正在获取知识库文章内容,DocID: {docid}")
    kms_view_url = settings.KMS_VIEW_URL  # alternative source: os.getenv("KMS_VIEW_URL")
    # Errors are reported as strings rather than raised (see get_knowledge_list).
    try:
        response = requests.post(
            kms_view_url,
            headers=_KMS_HEADERS,
            json={"docid": docid},
            timeout=_KMS_TIMEOUT,
        )
        if response.status_code != 200:
            return f"请求失败,状态码: {response.status_code}"

        data = response.json()
        # "DocHtml" may be absent or null; either way convert an empty string.
        doc_html = data.get("DocHtml") or ""
        plain_text = html_to_text(doc_html)
        print(f"已获取到ID: {docid}的文章内容,长度{len(plain_text)}")
        return plain_text
    except Exception as e:
        return f"请求异常: {e}"


# NOTE: disabled tool, kept for reference only. It combined the list lookup,
# relevance ranking and content fetch into a single call. Code and runtime
# strings are left untouched so it can be re-enabled as-is.
#
# @tool
# def search_and_retrieve_knowledge(keywords: List[str], max_matches: int = 10) -> str:
#     """搜索并获取最相关的知识库文章内容
#
#     此工具会:
#     1. 自动搜索知识库文章列表
#     2. 使用改进的智能算法根据关键词匹配度排序,选择最相关的文章
#     3. 自动获取并返回该文章的完整内容
#
#     拆分关键词核心原则:
#     最小化原则:将用户问题拆分为最小单位的关键词,最好2个字一个关键词
#     例如:"销售订单终止数量失败" → ["销售", "订单", "终止", "数量", "失败"],"销售订单提示没有新建权限怎么办" → ["销售", "订单", "新建", "权限"]
#
#     Args:
#         keywords: 关键词列表,会尽可能细化匹配
#         max_matches: 最多考虑的匹配文章数(默认10)
#
#     Returns:
#         最相关文章的完整内容
#     """
#     print(f"正在搜索知识库,关键词:{keywords}")
#     # 1. Fetch the article list
#     kms_list_url = settings.KMS_LIST_URL
#     payload = {
#         "categorycodeList": [],
#         "ignoreTypeSub": False,
#         "ignoreStandardByTopic": True,
#     }
#     headers = {
#         "Accept": "application/json, text/plain, */*",
#         "Content-Type": "application/json",
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
#     }
#     try:
#         # Fetch every article
#         response = requests.post(
#             kms_list_url, headers=headers, json=payload, timeout=10
#         )
#         if response.status_code != 200:
#             return f"获取文章列表失败,状态码: {response.status_code}"
#         data = response.json()
#         doc_list = data.get("docList", [])
#         if not doc_list:
#             return "知识库中没有找到文章"
#         # Print the article titles for debugging
#         print(f"知识库中共有 {len(doc_list)} 篇文章")
#         print("所有文章标题:")
#         for i, doc in enumerate(doc_list[:20]):  # only show the first 20
#             print(f" {i+1}. {doc['DocName']}")
#         # 2. Find the most relevant documents with the improved algorithm
#         relevant_docs = find_most_relevant_document(doc_list, keywords, max_matches)
#         if not relevant_docs:
#             # Nothing matched; retry with a looser match
#             print("使用宽松匹配重新搜索...")
#             # Keep only the core keywords for the retry
#             core_keywords = [kw for kw in keywords if len(kw) >= 2]
#             if core_keywords:
#                 relevant_docs = find_most_relevant_document(
#                     doc_list, core_keywords, max_matches
#                 )
#         if not relevant_docs:
#             return "没有找到与关键词相关的文章"
#         print(f"找到 {len(relevant_docs)} 篇相关文章,按相关性排序")
#         for i, doc in enumerate(relevant_docs):
#             print(
#                 f" {i+1}. {doc['doc_name']} (得分:{doc['relevance_score']:.2f}, 匹配{doc['match_count']}个关键词)"
#             )
#         # 3. Pick the most relevant article
#         best_doc = relevant_docs[0]
#         print(
#             f"选择最相关的文章: {best_doc['doc_name']} (DocID: {best_doc['doc_id']}, 得分:{best_doc['relevance_score']:.2f})"
#         )
#         # 4. Fetch the article content
#         kms_view_url = settings.KMS_VIEW_URL
#         content_payload = {"docid": best_doc["doc_id"]}
#         content_response = requests.post(
#             kms_view_url, headers=headers, json=content_payload, timeout=10
#         )
#         if content_response.status_code != 200:
#             return f"获取文章内容失败,状态码: {content_response.status_code}"
#         content_data = content_response.json()
#         doc_html = content_data.get("DocHtml", "")
#         plain_text = html_to_text(doc_html)
#         # 5. Build the result
#         result = f"【知识库文章】\n"
#         result += f"标题: {best_doc['doc_name']}\n"
#         result += f"相关性得分: {best_doc['relevance_score']:.2f}\n"
#         result += f"匹配关键词数量: {best_doc['match_count']}\n"
#         if best_doc["keywords"]:
#             result += f"文章关键词: {best_doc['keywords']}\n"
#         result += f"内容: {plain_text}\n"
#         print(f"已获取文章内容,长度: {len(plain_text)} 字符")
#         return result
#     except Exception as e:
#         return f"搜索和获取知识库文章时出错: {e}"