From 3a48ac735025274d940a62d53c8cc3d0f20af0b1 Mon Sep 17 00:00:00 2001
From: zxstty
Date: Mon, 14 Jul 2025 10:42:13 +0800
Subject: [PATCH 1/9] Return the doc abstract instead of a link in chunk
 retrieval
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_chain/apps/base/task/worker/parse_document_worker.py | 3 +--
 data_chain/apps/service/chunk_service.py                  | 6 +++++-
 data_chain/entities/response_data.py                      | 2 +-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/data_chain/apps/base/task/worker/parse_document_worker.py b/data_chain/apps/base/task/worker/parse_document_worker.py
index a1b568a5..50a59964 100644
--- a/data_chain/apps/base/task/worker/parse_document_worker.py
+++ b/data_chain/apps/base/task/worker/parse_document_worker.py
@@ -417,8 +417,7 @@ class ParseDocumentWorker(BaseWorker):
         if llm is not None:
             abstract = await TokenTool.get_abstract_by_llm(abstract, llm)
         else:
-            keywords = TokenTool.get_top_k_keywords(abstract, 20)
-            abstract = ' '.join(keywords)
+            abstract = abstract[:128]
         abstract_vector = await Embedding.vectorize_embedding(abstract)
         await DocumentManager.update_document_by_doc_id(
             doc_id,
diff --git a/data_chain/apps/service/chunk_service.py b/data_chain/apps/service/chunk_service.py
index e4963464..e6c72100 100644
--- a/data_chain/apps/service/chunk_service.py
+++ b/data_chain/apps/service/chunk_service.py
@@ -137,8 +137,12 @@ class ChunkService:
                 chunk.text = TokenTool.compress_tokens(chunk.text)
                 dc = DocChunk(docId=chunk_entity.doc_id, docName=chunk_entity.doc_name, chunks=[chunk])
                 search_chunk_msg.doc_chunks.append(dc)
+        doc_entities = await DocumentManager.list_document_by_doc_ids(
+            [doc_chunk.doc_id for doc_chunk in search_chunk_msg.doc_chunks])
+        doc_map = {doc_entity.id: doc_entity for doc_entity in doc_entities}
         for doc_chunk in search_chunk_msg.doc_chunks:
-            doc_chunk.doc_link = await DocumentService.generate_doc_download_url(doc_chunk.doc_id)
+            doc_entity = doc_map.get(doc_chunk.doc_id)
+            doc_chunk.doc_abstract = doc_entity.abstract if doc_entity else ""
         return search_chunk_msg

     async def update_chunk_by_id(chunk_id: uuid.UUID, req: UpdateChunkRequest) -> uuid.UUID:
diff --git a/data_chain/entities/response_data.py b/data_chain/entities/response_data.py
index 4b9c478c..489a6084 100644
--- a/data_chain/entities/response_data.py
+++ b/data_chain/entities/response_data.py
@@ -326,7 +326,7 @@ class DocChunk(BaseModel):
     """Post /chunk/search 数据结构"""
     doc_id: uuid.UUID = Field(description="文档ID", alias="docId")
     doc_name: str = Field(description="文档名称", alias="docName")
-    doc_link: str = Field(default="", description="文档链接", alias="docLink")
+    doc_abstract: str = Field(default="", description="文档摘要", alias="docAbstract")
     chunks: list[Chunk] = Field(default=[], description="分片列表", alias="chunks")
--
Gitee
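
Note on the chunk_service.py hunk above: the per-document call to generate_doc_download_url inside the loop is replaced by one batched list_document_by_doc_ids query plus an id-to-entity map, so N result documents cost one database round trip instead of N. A minimal standalone sketch of the pattern (the entity and chunk classes here are illustrative stand-ins, not the project's ORM types):

    import uuid
    from dataclasses import dataclass

    @dataclass
    class Doc:                      # stand-in for DocumentEntity
        id: uuid.UUID
        abstract: str

    @dataclass
    class DocChunk:                 # stand-in for the response model
        doc_id: uuid.UUID
        doc_abstract: str = ""

    def attach_abstracts(doc_chunks: list[DocChunk], docs: list[Doc]) -> None:
        # One batched query fetched `docs`; build the lookup map once.
        doc_map = {doc.id: doc for doc in docs}
        for dc in doc_chunks:
            doc = doc_map.get(dc.doc_id)
            # A missing document degrades to an empty abstract instead of raising.
            dc.doc_abstract = doc.abstract if doc else ""
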
From 1789ca571686dd70df29325db1edcd2f1f3c6dd3 Mon Sep 17 00:00:00 2001
From: zxstty
Date: Mon, 14 Jul 2025 11:32:38 +0800
Subject: [PATCH 2/9] Also return the file extension
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_chain/apps/service/chunk_service.py | 1 +
 data_chain/entities/response_data.py     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/data_chain/apps/service/chunk_service.py b/data_chain/apps/service/chunk_service.py
index e6c72100..ed35fe17 100644
--- a/data_chain/apps/service/chunk_service.py
+++ b/data_chain/apps/service/chunk_service.py
@@ -143,6 +143,7 @@ class ChunkService:
         for doc_chunk in search_chunk_msg.doc_chunks:
             doc_entity = doc_map.get(doc_chunk.doc_id)
             doc_chunk.doc_abstract = doc_entity.abstract if doc_entity else ""
+            doc_chunk.doc_extension = doc_entity.extension if doc_entity else ""
         return search_chunk_msg

     async def update_chunk_by_id(chunk_id: uuid.UUID, req: UpdateChunkRequest) -> uuid.UUID:
diff --git a/data_chain/entities/response_data.py b/data_chain/entities/response_data.py
index 489a6084..0229c28e 100644
--- a/data_chain/entities/response_data.py
+++ b/data_chain/entities/response_data.py
@@ -327,6 +327,7 @@ class DocChunk(BaseModel):
     doc_id: uuid.UUID = Field(description="文档ID", alias="docId")
     doc_name: str = Field(description="文档名称", alias="docName")
     doc_abstract: str = Field(default="", description="文档摘要", alias="docAbstract")
+    doc_extension: str = Field(default="", description="文档扩展名", alias="docExtension")
     chunks: list[Chunk] = Field(default=[], description="分片列表", alias="chunks")
--
Gitee

From f2d6ded5dd312fcae2ab8bd6896601bdaca5c0b8 Mon Sep 17 00:00:00 2001
From: zxstty
Date: Mon, 14 Jul 2025 11:55:38 +0800
Subject: [PATCH 3/9] Also return the file size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_chain/apps/service/chunk_service.py | 1 +
 data_chain/entities/response_data.py     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/data_chain/apps/service/chunk_service.py b/data_chain/apps/service/chunk_service.py
index ed35fe17..d1b70646 100644
--- a/data_chain/apps/service/chunk_service.py
+++ b/data_chain/apps/service/chunk_service.py
@@ -144,6 +144,7 @@ class ChunkService:
             doc_entity = doc_map.get(doc_chunk.doc_id)
             doc_chunk.doc_abstract = doc_entity.abstract if doc_entity else ""
             doc_chunk.doc_extension = doc_entity.extension if doc_entity else ""
+            doc_chunk.doc_size = doc_entity.size if doc_entity else 0
         return search_chunk_msg

     async def update_chunk_by_id(chunk_id: uuid.UUID, req: UpdateChunkRequest) -> uuid.UUID:
diff --git a/data_chain/entities/response_data.py b/data_chain/entities/response_data.py
index 0229c28e..5a3bf03b 100644
--- a/data_chain/entities/response_data.py
+++ b/data_chain/entities/response_data.py
@@ -328,6 +328,7 @@ class DocChunk(BaseModel):
     doc_name: str = Field(description="文档名称", alias="docName")
     doc_abstract: str = Field(default="", description="文档摘要", alias="docAbstract")
     doc_extension: str = Field(default="", description="文档扩展名", alias="docExtension")
+    doc_size: int = Field(default=0, description="文档大小,单位是KB", alias="docSize")
     chunks: list[Chunk] = Field(default=[], description="分片列表", alias="chunks")
--
Gitee
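
After patches 2 and 3, each DocChunk in the /chunk/search response carries the document's abstract, extension, and size next to its chunks. A trimmed sketch of the model and its alias-based serialization, assuming Pydantic v2 (other fields omitted; values are illustrative):

    import uuid
    from pydantic import BaseModel, Field

    class DocChunk(BaseModel):
        doc_id: uuid.UUID = Field(description="文档ID", alias="docId")
        doc_abstract: str = Field(default="", alias="docAbstract")
        doc_extension: str = Field(default="", alias="docExtension")
        doc_size: int = Field(default=0, alias="docSize")  # size in KB

    dc = DocChunk(docId=uuid.uuid4(), docAbstract="summary...",
                  docExtension=".docx", docSize=42)
    # Serializing by alias reproduces the camelCase wire format:
    print(dc.model_dump(by_alias=True))
    # {'docId': UUID(...), 'docAbstract': 'summary...', 'docExtension': '.docx', 'docSize': 42}
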
From 4a203eeb43f44caf11e553f932c490a4fb5432c2 Mon Sep 17 00:00:00 2001
From: zxstty
Date: Mon, 14 Jul 2025 19:54:39 +0800
Subject: [PATCH 4/9] Refine the general keyword matching logic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_chain/manager/chunk_manager.py    | 7 ++++++-
 data_chain/manager/document_manager.py | 7 ++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/data_chain/manager/chunk_manager.py b/data_chain/manager/chunk_manager.py
index a45c9fb3..f9c3e7d5 100644
--- a/data_chain/manager/chunk_manager.py
+++ b/data_chain/manager/chunk_manager.py
@@ -245,7 +245,12 @@ class ChunkManager():
             # 计算相似度分数并选择它
             similarity_score = func.ts_rank_cd(
                 func.to_tsvector(tokenizer, ChunkEntity.text),
-                func.plainto_tsquery(tokenizer, query)
+                func.to_tsquery(
+                    func.replace(
+                        func.text(func.plainto_tsquery(tokenizer, query)),
+                        '&', '|'
+                    )
+                )
             ).label("similarity_score")

             stmt = (
diff --git a/data_chain/manager/document_manager.py b/data_chain/manager/document_manager.py
index 973be2b0..a6a4b48c 100644
--- a/data_chain/manager/document_manager.py
+++ b/data_chain/manager/document_manager.py
@@ -99,7 +99,12 @@ class DocumentManager():
                 tokenizer = 'english'
             similarity_score = func.ts_rank_cd(
                 func.to_tsvector(tokenizer, DocumentEntity.abstract),
-                func.plainto_tsquery(tokenizer, query)
+                func.to_tsquery(
+                    func.replace(
+                        func.text(func.plainto_tsquery(tokenizer, query)),
+                        '&', '|'
+                    )
+                )
             ).label("similarity_score")
             stmt = (
                 select(DocumentEntity, similarity_score)
--
Gitee
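
The nested expression in patch 4 loosens full-text matching from AND to OR semantics: plainto_tsquery sanitizes the raw query into a '&'-joined tsquery, the tsquery is rendered back to text with every '&' swapped for '|', and to_tsquery re-parses the result so a row can match on any lexeme rather than all of them. A standalone SQLAlchemy sketch of the same construction (the function and parameter names here are illustrative):

    from sqlalchemy import func

    def loose_keyword_score(tokenizer: str, text_column, query: str):
        # plainto_tsquery('english', 'open source db') -> 'open' & 'sourc' & 'db'
        sanitized = func.plainto_tsquery(tokenizer, query)
        # Render the tsquery as text and relax every AND into an OR:
        # 'open' & 'sourc' & 'db'  ->  'open' | 'sourc' | 'db'
        any_lexeme = func.to_tsquery(func.replace(func.text(sanitized), '&', '|'))
        # Rank rows that contain at least one query lexeme.
        return func.ts_rank_cd(func.to_tsvector(tokenizer, text_column), any_lexeme)
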
From c7bc2f818cc4b93316f30bbbc2627c3a50ceba87 Mon Sep 17 00:00:00 2001
From: zxstty
Date: Mon, 14 Jul 2025 20:25:21 +0800
Subject: [PATCH 5/9] Improve the retrieval enhancement algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_chain/apps/app.py                         |  1 +
 data_chain/entities/enum.py                    |  1 +
 data_chain/manager/chunk_manager.py            | 24 +++++---
 data_chain/manager/document_manager.py         |  2 +
 ...ic_weighted_keyword_and_vector_searcher.py  | 58 +++++++++++++++++++
 data_chain/rag/keyword_and_vector_searcher.py  | 12 ++--
 data_chain/rag/keyword_searcher.py             |  4 +-
 7 files changed, 85 insertions(+), 17 deletions(-)
 create mode 100644 data_chain/rag/dynamic_weighted_keyword_and_vector_searcher.py

diff --git a/data_chain/apps/app.py b/data_chain/apps/app.py
index cb4bf7e5..30d7f7d7 100644
--- a/data_chain/apps/app.py
+++ b/data_chain/apps/app.py
@@ -53,6 +53,7 @@ from data_chain.rag import (
     base_searcher,
     keyword_searcher,
     vector_searcher,
+    dynamic_weighted_keyword_and_vector_searcher,
     keyword_and_vector_searcher,
     doc2chunk_searcher,
     doc2chunk_bfs_searcher,
diff --git a/data_chain/entities/enum.py b/data_chain/entities/enum.py
index 5866976e..90d7c702 100644
--- a/data_chain/entities/enum.py
+++ b/data_chain/entities/enum.py
@@ -158,6 +158,7 @@ class SearchMethod(str, Enum):
     """搜索方法"""
     KEYWORD = "keyword"
     VECTOR = "vector"
+    DYNAMIC_WEIGHTED_KEYWORD_AND_VECTOR = "dynamic_weighted_keyword_and_vector"
     KEYWORD_AND_VECTOR = "keyword_and_vector"
    DOC2CHUNK = "doc2chunk"
     DOC2CHUNK_BFS = "doc2chunk_bfs"
diff --git a/data_chain/manager/chunk_manager.py b/data_chain/manager/chunk_manager.py
index f9c3e7d5..9e6c7e75 100644
--- a/data_chain/manager/chunk_manager.py
+++ b/data_chain/manager/chunk_manager.py
@@ -224,7 +224,7 @@ class ChunkManager():
     async def get_top_k_chunk_by_kb_id_keyword(
             kb_id: uuid.UUID, query: str, top_k: int, doc_ids: list[uuid.UUID] = None,
             banned_ids: list[uuid.UUID] = [],
-            chunk_to_type: str = None, pre_ids: list[uuid.UUID] = None) -> List[ChunkEntity]:
+            chunk_to_type: str = None, pre_ids: list[uuid.UUID] = None, is_tight: bool = True) -> List[ChunkEntity]:
         """根据知识库ID和向量查询文档解析结果"""
         try:
             async with await DataBase.get_session() as session:
@@ -243,15 +243,21 @@ class ChunkManager():
                 tokenizer = 'zhparser'

             # 计算相似度分数并选择它
-            similarity_score = func.ts_rank_cd(
-                func.to_tsvector(tokenizer, ChunkEntity.text),
-                func.to_tsquery(
-                    func.replace(
-                        func.text(func.plainto_tsquery(tokenizer, query)),
-                        '&', '|'
+            if is_tight:
+                similarity_score = func.ts_rank_cd(
+                    func.to_tsvector(tokenizer, ChunkEntity.text),
+                    func.to_tsquery(tokenizer, query)
+                ).label("similarity_score")
+            else:
+                similarity_score = func.ts_rank_cd(
+                    func.to_tsvector(tokenizer, ChunkEntity.text),
+                    func.to_tsquery(
+                        func.replace(
+                            func.text(func.plainto_tsquery(tokenizer, query)),
+                            '&', '|'
+                        )
                     )
-                )
-            ).label("similarity_score")
+                ).label("similarity_score")

             stmt = (
                 select(ChunkEntity, similarity_score)
diff --git a/data_chain/manager/document_manager.py b/data_chain/manager/document_manager.py
index a6a4b48c..55cb87a9 100644
--- a/data_chain/manager/document_manager.py
+++ b/data_chain/manager/document_manager.py
@@ -97,6 +97,7 @@ class DocumentManager():
                 tokenizer = 'zhparser'
             elif kb_entity.tokenizer == Tokenizer.EN.value:
                 tokenizer = 'english'
+
             similarity_score = func.ts_rank_cd(
                 func.to_tsvector(tokenizer, DocumentEntity.abstract),
                 func.to_tsquery(
@@ -106,6 +107,7 @@
                     )
                 )
             ).label("similarity_score")
+
             stmt = (
                 select(DocumentEntity, similarity_score)
                 .where(DocumentEntity.kb_id == kb_id)
diff --git a/data_chain/rag/dynamic_weighted_keyword_and_vector_searcher.py b/data_chain/rag/dynamic_weighted_keyword_and_vector_searcher.py
new file mode 100644
index 00000000..5efe05ee
--- /dev/null
+++ b/data_chain/rag/dynamic_weighted_keyword_and_vector_searcher.py
@@ -0,0 +1,58 @@
+import asyncio
+import uuid
+from pydantic import BaseModel, Field
+import random
+from data_chain.logger.logger import logger as logging
+from data_chain.stores.database.database import ChunkEntity
+from data_chain.parser.tools.token_tool import TokenTool
+from data_chain.manager.chunk_manager import ChunkManager
+from data_chain.rag.base_searcher import BaseSearcher
+from data_chain.embedding.embedding import Embedding
+from data_chain.entities.enum import SearchMethod
+
+
+class KeywordVectorSearcher(BaseSearcher):
+    """
+    关键词向量检索
+    """
+    name = SearchMethod.DYNAMIC_WEIGHTED_KEYWORD_AND_VECTOR.value
+
+    @staticmethod
+    async def search(
+            query: str, kb_id: uuid.UUID, top_k: int = 5, doc_ids: list[uuid.UUID] = None,
+            banned_ids: list[uuid.UUID] = []
+    ) -> list[ChunkEntity]:
+        """
+        向量检索
+        :param query: 查询
+        :param top_k: 返回的结果数量
+        :return: 检索结果
+        """
+        vector = await Embedding.vectorize_embedding(query)
+        try:
+            chunk_entities_get_by_keyword = await ChunkManager.get_top_k_chunk_by_kb_id_keyword(
+                kb_id, query, max(top_k//3, 1), doc_ids, banned_ids)
+            banned_ids += [chunk_entity.id for chunk_entity in chunk_entities_get_by_keyword]
+            keywords, weights = TokenTool.get_top_k_keywords_and_weights(query)
+            logging.error(f"[KeywordVectorSearcher] keywords: {keywords}, weights: {weights}")
+            chunk_entities_get_by_dynamic_weighted_keyword = await ChunkManager.get_top_k_chunk_by_kb_id_dynamic_weighted_keyword(kb_id, keywords, weights, top_k//2, doc_ids, banned_ids)
+            banned_ids += [chunk_entity.id for chunk_entity in chunk_entities_get_by_dynamic_weighted_keyword]
+            chunk_entities_get_by_vector = []
+            for _ in range(3):
+                try:
+                    import time
+                    start_time = time.time()
+                    chunk_entities_get_by_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(chunk_entities_get_by_keyword)-len(chunk_entities_get_by_dynamic_weighted_keyword), doc_ids, banned_ids), timeout=3)
+                    end_time = time.time()
+                    logging.info(f"[KeywordVectorSearcher] 向量检索成功完成,耗时: {end_time - start_time:.2f}秒")
+                    break
+                except Exception as e:
+                    err = f"[KeywordVectorSearcher] 向量检索失败,error: {e}"
+                    logging.error(err)
+                    continue
+            chunk_entities = chunk_entities_get_by_keyword + chunk_entities_get_by_dynamic_weighted_keyword + chunk_entities_get_by_vector
+        except Exception as e:
+            err = f"[KeywordVectorSearcher] 关键词向量检索失败,error: {e}"
+            logging.exception(err)
+            return []
+        return chunk_entities
diff --git a/data_chain/rag/keyword_and_vector_searcher.py b/data_chain/rag/keyword_and_vector_searcher.py
index c3bbe937..602a1a8b 100644
--- a/data_chain/rag/keyword_and_vector_searcher.py
+++ b/data_chain/rag/keyword_and_vector_searcher.py
@@ -31,13 +31,11 @@ class KeywordVectorSearcher(BaseSearcher):
         vector = await Embedding.vectorize_embedding(query)
         try:
             chunk_entities_get_by_keyword = await ChunkManager.get_top_k_chunk_by_kb_id_keyword(
-                kb_id, query, max(top_k//3, 1), doc_ids, banned_ids)
+                kb_id, query, max(top_k//3, 1), doc_ids, banned_ids, is_tight=True)
+            banned_ids += [chunk_entity.id for chunk_entity in chunk_entities_get_by_keyword]
+            chunk_entities_get_by_keyword = await ChunkManager.get_top_k_chunk_by_kb_id_keyword(
+                kb_id, query, max(top_k//2, 1), doc_ids, banned_ids, is_tight=False)
             banned_ids += [chunk_entity.id for chunk_entity in chunk_entities_get_by_keyword]
-            keywords, weights = TokenTool.get_top_k_keywords_and_weights(query)
-            logging.error(f"[KeywordVectorSearcher] keywords: {keywords}, weights: {weights}")
-            chunk_entities_get_by_dynamic_weighted_keyword = await ChunkManager.get_top_k_chunk_by_kb_id_dynamic_weighted_keyword(kb_id, keywords, weights, top_k//2, doc_ids, banned_ids)
-            banned_ids += [chunk_entity.id for chunk_entity in chunk_entities_get_by_dynamic_weighted_keyword]
-            chunk_entities_get_by_vector = []
             for _ in range(3):
                 try:
                     import time
@@ -50,7 +48,7 @@ class KeywordVectorSearcher(BaseSearcher):
                     err = f"[KeywordVectorSearcher] 向量检索失败,error: {e}"
                     logging.error(err)
                     continue
-            chunk_entities = chunk_entities_get_by_keyword + chunk_entities_get_by_dynamic_weighted_keyword + chunk_entities_get_by_vector
+            chunk_entities = chunk_entities_get_by_keyword + chunk_entities_get_by_vector
         except Exception as e:
             err = f"[KeywordVectorSearcher] 关键词向量检索失败,error: {e}"
             logging.exception(err)
diff --git a/data_chain/rag/keyword_searcher.py b/data_chain/rag/keyword_searcher.py
index a87cedc2..d622055f 100644
--- a/data_chain/rag/keyword_searcher.py
+++ b/data_chain/rag/keyword_searcher.py
@@ -25,7 +25,9 @@ class KeyWordSearcher(BaseSearcher):
         :return: 检索结果
         """
         try:
-            chunk_entities = await ChunkManager.get_top_k_chunk_by_kb_id_keyword(kb_id, query, top_k, doc_ids, banned_ids)
+            chunk_entities = await ChunkManager.get_top_k_chunk_by_kb_id_keyword(kb_id, query, top_k//3, doc_ids, banned_ids)
+            banned_ids += [chunk_entity.id for chunk_entity in chunk_entities]
+            chunk_entities += await ChunkManager.get_top_k_chunk_by_kb_id_keyword(kb_id, query, top_k-len(chunk_entities), doc_ids, banned_ids, is_tight=False)
         except Exception as e:
             err = f"[KeyWordSearcher] 关键词检索失败,error: {e}"
             logging.exception(err)
--
Gitee
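
Patch 5 turns retrieval into a tiered budget: a tight keyword pass fills roughly a third of top_k, a looser pass (OR-matching, or dynamic-weighted keywords in the new searcher) fills about half, and vector search tops up the remainder, with every tier's hits appended to banned_ids so later tiers cannot return duplicates. A minimal sketch of that quota-and-dedup loop, with hypothetical async fetchers standing in for the ChunkManager calls:

    import uuid
    from typing import Awaitable, Callable

    Fetcher = Callable[..., Awaitable[list]]

    async def tiered_search(kb_id: uuid.UUID, query: str, top_k: int,
                            fetch_tight: Fetcher, fetch_loose: Fetcher,
                            fetch_vector: Fetcher) -> list:
        banned_ids: list[uuid.UUID] = []
        results: list = []
        for fetch, quota in ((fetch_tight, max(top_k // 3, 1)),   # precise AND matches first
                             (fetch_loose, max(top_k // 2, 1))):  # looser OR matches next
            hits = await fetch(kb_id, query, quota, banned_ids)
            banned_ids += [hit.id for hit in hits]                # exclude from later tiers
            results += hits
        # Vector search fills whatever budget remains.
        results += await fetch_vector(kb_id, query, top_k - len(results), banned_ids)
        return results
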
From e5cfc98e3fe7a691cc3de9b7d3692ce1ad9a2a50 Mon Sep 17 00:00:00 2001
From: zxstty
Date: Mon, 14 Jul 2025 21:29:28 +0800
Subject: [PATCH 6/9] Fix a bug in keyword and vector retrieval
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_chain/rag/keyword_and_vector_searcher.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_chain/rag/keyword_and_vector_searcher.py b/data_chain/rag/keyword_and_vector_searcher.py
index 602a1a8b..2d5e9683 100644
--- a/data_chain/rag/keyword_and_vector_searcher.py
+++ b/data_chain/rag/keyword_and_vector_searcher.py
@@ -33,14 +33,14 @@ class KeywordVectorSearcher(BaseSearcher):
             chunk_entities_get_by_keyword = await ChunkManager.get_top_k_chunk_by_kb_id_keyword(
                 kb_id, query, max(top_k//3, 1), doc_ids, banned_ids, is_tight=True)
             banned_ids += [chunk_entity.id for chunk_entity in chunk_entities_get_by_keyword]
-            chunk_entities_get_by_keyword = await ChunkManager.get_top_k_chunk_by_kb_id_keyword(
+            chunk_entities_get_by_keyword += await ChunkManager.get_top_k_chunk_by_kb_id_keyword(
                 kb_id, query, max(top_k//2, 1), doc_ids, banned_ids, is_tight=False)
             banned_ids += [chunk_entity.id for chunk_entity in chunk_entities_get_by_keyword]
             for _ in range(3):
                 try:
                     import time
                     start_time = time.time()
-                    chunk_entities_get_by_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(chunk_entities_get_by_keyword)-len(chunk_entities_get_by_dynamic_weighted_keyword), doc_ids, banned_ids), timeout=3)
+                    chunk_entities_get_by_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(chunk_entities_get_by_keyword)-len(chunk_entities_get_by_keyword), doc_ids, banned_ids), timeout=3)
                     end_time = time.time()
                     logging.info(f"[KeywordVectorSearcher] 向量检索成功完成,耗时: {end_time - start_time:.2f}秒")
                     break
--
Gitee

From 2fe1becd6ce242a23610e27eea3bff1e36231675 Mon Sep 17 00:00:00 2001
From: zxstty
Date: Mon, 14 Jul 2025 21:40:24 +0800
Subject: [PATCH 7/9] Fix keyword matching failures caused by special
 characters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_chain/manager/chunk_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_chain/manager/chunk_manager.py b/data_chain/manager/chunk_manager.py
index 9e6c7e75..277abc3e 100644
--- a/data_chain/manager/chunk_manager.py
+++ b/data_chain/manager/chunk_manager.py
@@ -246,7 +246,7 @@ class ChunkManager():
             if is_tight:
                 similarity_score = func.ts_rank_cd(
                     func.to_tsvector(tokenizer, ChunkEntity.text),
-                    func.to_tsquery(tokenizer, query)
+                    func.plainto_tsquery(tokenizer, query)
                 ).label("similarity_score")
             else:
                 similarity_score = func.ts_rank_cd(
--
Gitee
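
Patch 7 reverts the tight branch to plainto_tsquery because to_tsquery expects valid tsquery syntax (single lexemes joined by explicit & / | operators) and raises on free-form input, while plainto_tsquery tokenizes raw text safely. A small sketch of the difference (the connection URL is a placeholder):

    from sqlalchemy import create_engine, text

    engine = create_engine("postgresql+psycopg2://user:pass@localhost/db")  # placeholder

    with engine.connect() as conn:
        # Safe: free text is tokenized, stop words drop out -> 'databas'
        print(conn.execute(
            text("SELECT plainto_tsquery('english', :q)"),
            {"q": "what is a database?"},
        ).scalar())

        # Raises a tsquery syntax error: bare words lack & / | operators.
        conn.execute(text("SELECT to_tsquery('english', :q)"),
                     {"q": "what is a database?"})
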
From a5fae6e4556ce45caea4ec92efbf082072d2d61b Mon Sep 17 00:00:00 2001
From: zxstty
Date: Tue, 15 Jul 2025 09:19:27 +0800
Subject: [PATCH 8/9] Fix a bug in hybrid keyword & vector retrieval
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_chain/rag/keyword_and_vector_searcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_chain/rag/keyword_and_vector_searcher.py b/data_chain/rag/keyword_and_vector_searcher.py
index 2d5e9683..86b3b4f5 100644
--- a/data_chain/rag/keyword_and_vector_searcher.py
+++ b/data_chain/rag/keyword_and_vector_searcher.py
@@ -40,7 +40,7 @@ class KeywordVectorSearcher(BaseSearcher):
                 try:
                     import time
                     start_time = time.time()
-                    chunk_entities_get_by_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(chunk_entities_get_by_keyword)-len(chunk_entities_get_by_keyword), doc_ids, banned_ids), timeout=3)
+                    chunk_entities_get_by_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(chunk_entities_get_by_keyword), doc_ids, banned_ids), timeout=3)
                     end_time = time.time()
                     logging.info(f"[KeywordVectorSearcher] 向量检索成功完成,耗时: {end_time - start_time:.2f}秒")
                     break
--
Gitee

From 4ba374e885558f3556e4e1d93aee7b3ebf2570b3 Mon Sep 17 00:00:00 2001
From: zxstty
Date: Tue, 15 Jul 2025 11:00:03 +0800
Subject: [PATCH 9/9] Allow all users to access the default knowledge base
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_chain/apps/service/document_service.py       | 2 ++
 data_chain/apps/service/knwoledge_base_service.py | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/data_chain/apps/service/document_service.py b/data_chain/apps/service/document_service.py
index 2c43a07a..dfe65b9d 100644
--- a/data_chain/apps/service/document_service.py
+++ b/data_chain/apps/service/document_service.py
@@ -39,6 +39,8 @@ class DocumentService:
         """验证用户对文档的操作权限"""
         try:
             doc_entity = await DocumentManager.get_document_by_doc_id(doc_id)
+            if doc_entity.kb_id == DEFAULT_KNOWLEDGE_BASE_ID:
+                return True
             if doc_entity is None:
                 err = f"文档不存在, 文档ID: {doc_id}"
                 logging.error("[DocumentService] %s", err)
diff --git a/data_chain/apps/service/knwoledge_base_service.py b/data_chain/apps/service/knwoledge_base_service.py
index 8dcc3daf..17b445dc 100644
--- a/data_chain/apps/service/knwoledge_base_service.py
+++ b/data_chain/apps/service/knwoledge_base_service.py
@@ -23,7 +23,7 @@ from data_chain.entities.response_data import (
 from data_chain.apps.base.zip_handler import ZipHandler
 from data_chain.apps.service.task_queue_service import TaskQueueService
 from data_chain.entities.enum import Tokenizer, ParseMethod, TeamType, TeamStatus, KnowledgeBaseStatus, TaskType
-from data_chain.entities.common import DEFAULT_DOC_TYPE_ID, default_roles, IMPORT_KB_PATH_IN_OS, EXPORT_KB_PATH_IN_MINIO, IMPORT_KB_PATH_IN_MINIO
+from data_chain.entities.common import DEFAULT_KNOWLEDGE_BASE_ID, DEFAULT_DOC_TYPE_ID, default_roles, IMPORT_KB_PATH_IN_OS, EXPORT_KB_PATH_IN_MINIO, IMPORT_KB_PATH_IN_MINIO
 from data_chain.stores.database.database import TeamEntity, KnowledgeBaseEntity, DocumentTypeEntity
 from data_chain.stores.minio.minio import MinIO
 from data_chain.apps.base.convertor import Convertor
@@ -46,6 +46,8 @@ class KnowledgeBaseService:
     async def validate_user_action_to_knowledge_base(
             user_sub: str, kb_id: uuid.UUID, action: str) -> bool:
         """验证用户在知识库中的操作权限"""
+        if kb_id == DEFAULT_KNOWLEDGE_BASE_ID:
+            return True
         try:
             kb_entity = await KnowledgeBaseManager.get_knowledge_base_by_kb_id(kb_id)
             if kb_entity is None:
--
Gitee
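
One caveat in patch 9's document_service.py hunk: the new doc_entity.kb_id check runs before the existing doc_entity is None guard, so a lookup miss would now raise AttributeError instead of reaching the logging branch below it. A guard-first ordering of the same check avoids that; a sketch reusing the names from the patch, inside validate_user_action_to_document with the module's existing imports (not part of the series):

    doc_entity = await DocumentManager.get_document_by_doc_id(doc_id)
    if doc_entity is None:                               # existence check first
        logging.error("[DocumentService] 文档不存在, 文档ID: %s", doc_id)
        return False
    if doc_entity.kb_id == DEFAULT_KNOWLEDGE_BASE_ID:    # then the open default KB
        return True
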