diff --git a/data_chain/manager/chunk_manager.py b/data_chain/manager/chunk_manager.py index 7f83ee4dc54ed55d058b87b9f61a2875b3be22d4..a3488aba3813fb2106b5bcf0dcd55cc97294ecd3 100644 --- a/data_chain/manager/chunk_manager.py +++ b/data_chain/manager/chunk_manager.py @@ -265,13 +265,13 @@ class ChunkManager(): JOIN document ON document.id = chunk.doc_id WHERE {where_clause} AND (chunk.text_vector <=> :vector) IS NOT NULL - AND (chunk.text_vector <=> :vector) = (chunk.text_vector <=> :vector) ORDER BY similarity_score ASC NULLS LAST LIMIT :limit """ # -------------------------- # 原有逻辑:执行查询与结果处理(完全保留) # -------------------------- + logging.error(f"执行向量查询SQL: {base_sql},参数: {params}") result = await session.execute(text(base_sql), params) rows = result.fetchall() diff --git a/data_chain/rag/doc2chunk_bfs_searcher.py b/data_chain/rag/doc2chunk_bfs_searcher.py index 629e8d230b3095ec7e7d8352e673718e244f9ce5..e294b9102e3e64fd8a2c5b5a11a559d6b6236996 100644 --- a/data_chain/rag/doc2chunk_bfs_searcher.py +++ b/data_chain/rag/doc2chunk_bfs_searcher.py @@ -37,7 +37,7 @@ class Doc2ChunkBfsSearcher(BaseSearcher): root_chunk_entities_vector = [] for _ in range(3): try: - root_chunk_entities_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(root_chunk_entities_keyword), doc_ids, banned_ids, ChunkParseTopology.TREEROOT.value), timeout=20) + root_chunk_entities_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(root_chunk_entities_keyword), doc_ids, banned_ids, ChunkParseTopology.TREEROOT.value), timeout=300) break except Exception as e: err = f"[KeywordVectorSearcher] 向量检索失败,error: {e}" diff --git a/data_chain/rag/doc2chunk_searcher.py b/data_chain/rag/doc2chunk_searcher.py index 774048d5bc62aa6b54f1422b4daff2f9cbb91bf7..ec149dfc07f74977e3200af12b496e63a7b5b5b2 100644 --- a/data_chain/rag/doc2chunk_searcher.py +++ b/data_chain/rag/doc2chunk_searcher.py @@ -37,7 +37,7 @@ class Doc2ChunkSearcher(BaseSearcher): doc_entities_vector = [] for _ in range(3): try: - doc_entities_vector = await asyncio.wait_for(DocumentManager.get_top_k_document_by_kb_id_vector(kb_id, vector, top_k-len(doc_entities_keyword), use_doc_ids, banned_ids), timeout=10) + doc_entities_vector = await asyncio.wait_for(DocumentManager.get_top_k_document_by_kb_id_vector(kb_id, vector, top_k-len(doc_entities_keyword), use_doc_ids, banned_ids), timeout=300) break except Exception as e: err = f"[KeywordVectorSearcher] 向量检索失败,error: {e}" @@ -51,7 +51,7 @@ class Doc2ChunkSearcher(BaseSearcher): chunk_entities_vector = [] for _ in range(3): try: - chunk_entities_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(chunk_entities_keyword), use_doc_ids, banned_ids), timeout=10) + chunk_entities_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(chunk_entities_keyword), use_doc_ids, banned_ids), timeout=300) break except Exception as e: err = f"[KeywordVectorSearcher] 向量检索失败,error: {e}" diff --git a/data_chain/rag/dynamic_weighted_keyword_and_vector_searcher.py b/data_chain/rag/dynamic_weighted_keyword_and_vector_searcher.py index aed0ffcc25010f2cc0cede28c4f7cbe30ae390eb..320601329d16a0602537e32316c64cf38a9673c2 100644 --- a/data_chain/rag/dynamic_weighted_keyword_and_vector_searcher.py +++ b/data_chain/rag/dynamic_weighted_keyword_and_vector_searcher.py @@ -48,7 +48,7 @@ class DynamicKeywordVectorSearcher(BaseSearcher): start_time = time.time() logging.error( f"[DynamicKeywordVectorSearcher] 开始进行向量检索,top_k: {top_k-len(chunk_entities_get_by_keyword)-len(chunk_entities_get_by_dynamic_weighted_keyword)}") - chunk_entities_get_by_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(chunk_entities_get_by_keyword)-len(chunk_entities_get_by_dynamic_weighted_keyword), doc_ids, banned_ids), timeout=20) + chunk_entities_get_by_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(chunk_entities_get_by_keyword)-len(chunk_entities_get_by_dynamic_weighted_keyword), doc_ids, banned_ids), timeout=300) end_time = time.time() logging.info( f"[DynamicKeywordVectorSearcher] 向量检索成功完成,耗时: {end_time - start_time:.2f}秒") diff --git a/data_chain/rag/enhanced_by_llm_searcher.py b/data_chain/rag/enhanced_by_llm_searcher.py index b27cf7294ec2928b593e8bcc45f124551f60c65c..c6a9851cb2e4bbb7e926e40bfeecec2209718283 100644 --- a/data_chain/rag/enhanced_by_llm_searcher.py +++ b/data_chain/rag/enhanced_by_llm_searcher.py @@ -58,7 +58,7 @@ class EnhancedByLLMSearcher(BaseSearcher): sub_chunk_entities_vector = [] for _ in range(3): try: - sub_chunk_entities_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k, doc_ids, banned_ids), timeout=20) + sub_chunk_entities_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k, doc_ids, banned_ids), timeout=300) break except Exception as e: err = f"[EnhancedByLLMSearcher] 向量检索失败,error: {e}" diff --git a/data_chain/rag/keyword_and_vector_searcher.py b/data_chain/rag/keyword_and_vector_searcher.py index 9a0c7de20b63cae03044d217406caa4f6c1a3939..c500a9ca1bf0d85473576c5c785d7a6d89eded2a 100644 --- a/data_chain/rag/keyword_and_vector_searcher.py +++ b/data_chain/rag/keyword_and_vector_searcher.py @@ -40,7 +40,7 @@ class KeywordVectorSearcher(BaseSearcher): try: import time start_time = time.time() - chunk_entities_get_by_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(chunk_entities_get_by_keyword), doc_ids, banned_ids), timeout=20) + chunk_entities_get_by_vector = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k-len(chunk_entities_get_by_keyword), doc_ids, banned_ids), timeout=300) end_time = time.time() logging.info(f"[KeywordVectorSearcher] 向量检索成功完成,耗时: {end_time - start_time:.2f}秒") break diff --git a/data_chain/rag/vector_searcher.py b/data_chain/rag/vector_searcher.py index 1bd1d0cac655c2196db2232d84af26da3b3e02fe..eb8115ebdef8fb4bbe4f8c7e19fbc79c24457d3f 100644 --- a/data_chain/rag/vector_searcher.py +++ b/data_chain/rag/vector_searcher.py @@ -29,7 +29,7 @@ class VectorSearcher(BaseSearcher): chunk_entities = [] for _ in range(3): try: - chunk_entities = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k, doc_ids, banned_ids), timeout=20) + chunk_entities = await asyncio.wait_for(ChunkManager.get_top_k_chunk_by_kb_id_vector(kb_id, vector, top_k, doc_ids, banned_ids), timeout=300) break except Exception as e: err = f"[VectorSearcher] 向量检索失败,error: {e}" diff --git a/data_chain/stores/database/database.py b/data_chain/stores/database/database.py index 96efb585957ab1621f65796476c03e055b95e5eb..d0cd3961d26281fa4a6b9cc586369df078b779fe 100644 --- a/data_chain/stores/database/database.py +++ b/data_chain/stores/database/database.py @@ -64,7 +64,6 @@ class TeamEntity(Base): ) # 添加索引 __table_args__ = ( - Index('team_id_index', id), Index('team_name_index', name), Index('team_author_id_index', author_id) ) @@ -94,7 +93,6 @@ class TeamMessageEntity(Base): # 添加索引 __table_args__ = ( - Index('team_message_id_index', id), Index('team_message_team_id_index', team_id), Index('team_message_author_id_index', author_id) ) @@ -121,7 +119,6 @@ class RoleEntity(Base): ) # 添加索引 __table_args__ = ( - Index('role_id_index', id), Index('role_team_id_index', team_id), Index('role_name_index', name) ) @@ -165,7 +162,6 @@ class RoleActionEntity(Base): # 添加索引 __table_args__ = ( - Index('role_action_id_index', id), Index('role_action_role_id_index', role_id), Index('role_action_action_index', action) ) @@ -190,8 +186,7 @@ class UserEntity(Base): # 添加索引 __table_args__ = ( - Index('user_id_index', id), - Index('user_name_index', name) + Index('user_name_index', name), ) @@ -219,7 +214,6 @@ class UserMessageEntity(Base): # 添加索引 __table_args__ = ( - Index('user_message_id_index', id), Index('user_message_sender_id_index', sender_id), Index('user_message_receiver_id_index', receiver_id) ) @@ -245,7 +239,6 @@ class TeamUserEntity(Base): # 添加索引 __table_args__ = ( - Index('team_user_id_index', id), Index('team_user_team_id_index', team_id), Index('team_user_user_id_index', user_id) ) @@ -271,7 +264,6 @@ class UserRoleEntity(Base): # 添加索引 __table_args__ = ( - Index('user_role_id_index', id), Index('user_role_team_id_index', team_id), Index('user_role_user_id_index', user_id) ) @@ -312,7 +304,6 @@ class KnowledgeBaseEntity(Base): # 添加索引 __table_args__ = ( - Index('knowledge_base_id_index', id), Index('knowledge_base_team_id_index', team_id), Index('knowledge_base_name_index', name) ) @@ -376,12 +367,6 @@ class DocumentEntity(Base): onupdate=func.current_timestamp() ) __table_args__ = ( - Index("document_id_index", id), - Index("document_team_id_index", team_id), - Index("document_kb_id_index", kb_id), - Index("document_author_id_index", author_id), - Index("document_author_name_index", author_name), - Index("document_name_index", name), Index('abstract_ts_vector_index', abstract_ts_vector, postgresql_using='gin'), Index( @@ -427,10 +412,6 @@ class ChunkEntity(Base): server_default=func.current_timestamp(), onupdate=func.current_timestamp()) __table_args__ = ( - Index("chunk_id_index", id), - Index("chunk_team_id_index", team_id), - Index("chunk_kb_id_index", kb_id), - Index("chunk_doc_id_index", doc_id), Index('text_ts_vector_index', text_ts_vector, postgresql_using='gin'), Index( 'text_vector_index', @@ -464,7 +445,6 @@ class ImageEntity(Base): # 添加索引 __table_args__ = ( - Index('image_id_index', id), Index('image_team_id_index', team_id), Index('image_doc_id_index', doc_id), Index('image_chunk_id_index', chunk_id) @@ -501,7 +481,6 @@ class DataSetEntity(Base): # 添加索引 __table_args__ = ( - Index('dataset_id_index', id), Index('dataset_team_id_index', team_id), Index('dataset_kb_id_index', kb_id), Index('dataset_name_index', name) @@ -527,7 +506,6 @@ class DataSetDocEntity(Base): # 添加索引 __table_args__ = ( - Index('dataset_doc_id_index', id), Index('dataset_doc_dataset_id_index', dataset_id), Index('dataset_doc_doc_id_index', doc_id) ) @@ -557,7 +535,6 @@ class QAEntity(Base): ) # 添加索引 __table_args__ = ( - Index('qa_id_index', id), Index('qa_dataset_id_index', dataset_id), Index('qa_doc_id_index', doc_id) ) @@ -600,7 +577,6 @@ class TestingEntity(Base): # 添加索引 __table_args__ = ( - Index('testing_id_index', id), Index('testing_team_id_index', team_id), Index('testing_kb_id_index', kb_id), Index('testing_dataset_id_index', dataset_id) @@ -640,8 +616,7 @@ class TestCaseEntity(Base): # 添加索引 __table_args__ = ( - Index('testcase_id_index', id), - Index('testcase_testing_id_index', testing_id) + Index('testcase_testing_id_index', testing_id), ) @@ -669,7 +644,6 @@ class TaskEntity(Base): # 添加索引 __table_args__ = ( - Index('task_id_index', id), Index('task_team_id_index', team_id), Index('task_user_id_index', user_id), Index('task_op_id_index', op_id), @@ -699,8 +673,7 @@ class TaskReportEntity(Base): # 添加索引 __table_args__ = ( - Index('task_report_id_index', id), - Index('task_report_task_id_index', task_id) + Index('task_report_task_id_index', task_id), ) @@ -716,7 +689,6 @@ class TaskQueueEntity(Base): ) # 添加索引以提高查询性能 __table_args__ = ( - Index('idx_task_queue_status', 'status'), Index('idx_task_queue_created_time', 'created_time'), ) @@ -735,6 +707,7 @@ class DataBase: pool_size = os.cpu_count() if pool_size is None: pool_size = 5 + logging.error(f"Database pool size set to: {pool_size}") engine = create_async_engine( database_url, echo=False,