diff --git a/data_chain/apps/router/chunk.py b/data_chain/apps/router/chunk.py index a32d9cfbeb26557259d760bedfd12b382d0db03b..e945682481643b488b9fc3dd8bc667044214039e 100644 --- a/data_chain/apps/router/chunk.py +++ b/data_chain/apps/router/chunk.py @@ -57,7 +57,7 @@ async def update_chunk_by_id(user_sub: Annotated[str, Depends(get_user_sub)], chunk_id: Annotated[UUID, Query(alias="chunkId")], req: Annotated[UpdateChunkRequest, Body()]): if not (await ChunkService.validate_user_action_to_chunk(user_sub, chunk_id, action)): - raise ChunkPermissionDeniedException("访问该文档的分片", str(req.doc_id)) + raise ChunkPermissionDeniedException("更新分片", str(chunk_id)) chunk_id = await ChunkService.update_chunk_by_id(chunk_id, req) await TeamService.add_team_msg(user_sub, chunk_id, IdType.CHUNK, MessageLevel.INFO, '更新了知识库{kbName}的文档《{documentName}》的分片', 'knowledge base {kbName} Document {documentName} chunk updated') return UpdateChunkResponse(result=chunk_id) @@ -70,7 +70,7 @@ async def update_chunk_enabled_by_id(user_sub: Annotated[str, Depends(get_user_s enabled: Annotated[bool, Query()]): for chunk_id in chunk_ids: if not (await ChunkService.validate_user_action_to_chunk(user_sub, chunk_id, action)): - raise ChunkPermissionDeniedException("访问该文档的分片", str(req.doc_id)) + raise ChunkPermissionDeniedException("访问分片", str(chunk_id)) chunk_ids = await ChunkService.update_chunks_enabled_by_id(chunk_ids, enabled) for chunk_id in chunk_ids: await TeamService.add_team_msg(user_sub, chunk_id, IdType.CHUNK, MessageLevel.INFO, '更新了知识库{kbName}的文档《{documentName}》的分片启用状态', 'knowledge base {kbName} Document {documentName} chunk enabled status updated') diff --git a/data_chain/apps/router/dataset.py b/data_chain/apps/router/dataset.py index 9e3d51c65aeb0ee4595220f52db5d543b9ea60f1..6c48344b8a7176d9b646008b1a17848e3ce76325 100644 --- a/data_chain/apps/router/dataset.py +++ b/data_chain/apps/router/dataset.py @@ -72,7 +72,7 @@ async def is_dataset_have_testing( action: Annotated[str, Depends(get_route_info)], dataset_id: Annotated[UUID, Query(alias="datasetId")]): if not (await DataSetService.validate_user_action_to_dataset(user_sub, dataset_id, action)): - raise DatasetPermissionDeniedException("访问该数据集的数据", str(req.dataset_id)) + raise DatasetPermissionDeniedException("访问该数据集的数据", str(dataset_id)) is_dataset_have_testing_response = await DataSetService.is_dataset_have_testing(dataset_id) return IsDatasetHaveTestingResponse(result=is_dataset_have_testing_response) @@ -138,7 +138,7 @@ async def export_dataset_by_dataset_ids( dataset_ids: Annotated[list[UUID], Query(alias="datasetIds")]): for dataset_id in dataset_ids: if not (await DataSetService.validate_user_action_to_dataset(user_sub, dataset_id, action)): - raise DatasetPermissionDeniedException("访问该数据集的数据", str(req.dataset_id)) + raise DatasetPermissionDeniedException("访问该数据集的数据", str(dataset_id)) dataset_export_task_ids = await DataSetService.export_dataset(dataset_ids) for dataset_id in dataset_ids: await TeamService.add_team_msg(user_sub, dataset_id, IdType.DATASET, MessageLevel.INFO, '导出了知识库{kbName}的数据集{datasetName}', 'knowledge base {kbName} Dataset {datasetName} exported') @@ -165,7 +165,7 @@ async def update_dataset_by_dataset_id( database_id: Annotated[UUID, Query(alias="databaseId")], req: Annotated[UpdateDatasetRequest, Body(...)]): if not (await DataSetService.validate_user_action_to_dataset(user_sub, database_id, action)): - raise DatasetPermissionDeniedException("访问该数据集", str(dataset_id)) + raise DatasetPermissionDeniedException("访问该数据集", str(database_id)) database_id = await DataSetService.update_dataset_by_dataset_id(database_id, req) await TeamService.add_team_msg(user_sub, database_id, IdType.DATASET, MessageLevel.INFO, '更新了知识库{kbName}的数据集{datasetName}', 'knowledge base {kbName} Dataset {datasetName} updated') return UpdateDatasetResponse(result=database_id) @@ -191,7 +191,7 @@ async def delete_dataset_by_dataset_ids( database_ids: Annotated[list[UUID], Body(alias="databaseId")]): for database_id in database_ids: if not (await DataSetService.validate_user_action_to_dataset(user_sub, database_id, action)): - raise DatasetPermissionDeniedException("访问该数据集", str(dataset_id)) + raise DatasetPermissionDeniedException("访问该数据集", str(database_id)) dataset_ids = await DataSetService.delete_dataset_by_dataset_ids(database_ids) for dataset_id in dataset_ids: await TeamService.add_team_msg(user_sub, dataset_id, IdType.DATASET, MessageLevel.WARNING, '删除了知识库{kbName}的数据集{datasetName}', 'knowledge base {kbName} Dataset {datasetName} deleted') @@ -205,7 +205,7 @@ async def delete_data_by_data_ids( data_ids: Annotated[list[UUID], Body(alias="dataIds")]): for data_id in data_ids: if not (await DataSetService.validate_user_action_to_data(user_sub, data_id, action)): - raise DatasetPermissionDeniedException("访问该数据集的数据", str(req.dataset_id)) + raise DatasetPermissionDeniedException("访问该数据集的数据", str(data_id)) for data_id in data_ids: await TeamService.add_team_msg(user_sub, data_id, IdType.DATASET_DATA, MessageLevel.WARNING, '删除了知识库{kbName}的数据集{datasetName}的数据', 'knowledge base {kbName} Dataset {datasetName} data deleted') await DataSetService.delete_data_by_data_ids(data_ids) diff --git a/data_chain/apps/router/document.py b/data_chain/apps/router/document.py index 93178a0a757ba2604875219d57a5e3b8fbca48a8..e9c5555aa274e64d31c7277c6c5e863724d3c46b 100644 --- a/data_chain/apps/router/document.py +++ b/data_chain/apps/router/document.py @@ -62,7 +62,7 @@ async def download_doc_by_id( action: Annotated[str, Depends(get_route_info)], doc_id: Annotated[UUID, Query(alias="docId")]): if not (await DocumentService.validate_user_action_to_document(user_sub, doc_id, action)): - raise DocumentPermissionDeniedException("访问该文档", str(doc_id)) + raise DocumentPermissionDeniedException("下载该文档", str(doc_id)) document_link_url = await DocumentService.generate_doc_download_url(doc_id) document_name, extension = await DocumentService.get_doc_name_and_extension(doc_id) async with AsyncClient() as async_client: @@ -89,8 +89,7 @@ async def get_doc_report( action: Annotated[str, Depends(get_route_info)], doc_id: Annotated[UUID, Query(alias="docId")]): if not (await DocumentService.validate_user_action_to_document(user_sub, doc_id, action)): - - raise DocumentPermissionDeniedException("访问该文档", str(doc_id)) + raise DocumentPermissionDeniedException("访问该文档的解析报告", str(doc_id)) task_report = await DocumentService.get_doc_report(doc_id) await TeamService.add_team_msg(user_sub, doc_id, IdType.DOCUMENT, MessageLevel.INFO, '查看了知识库{kbName}的文档《{documentName}》的解析报告', 'knowledge base {kbName} Document <{documentName}> report viewed') return GetDocumentReportResponse(result=task_report) diff --git a/data_chain/apps/router/knowledge_base.py b/data_chain/apps/router/knowledge_base.py index 58893f1f4c0bf02a803e354078ccbe1768692368..3dd587b0d25ec6f00fda5e28b71d2adb509605d9 100644 --- a/data_chain/apps/router/knowledge_base.py +++ b/data_chain/apps/router/knowledge_base.py @@ -138,7 +138,7 @@ async def export_kb_by_kb_ids( kb_ids: Annotated[list[UUID], Query(alias="kbIds")]): for kb_id in kb_ids: if not await KnowledgeBaseService.validate_user_action_to_knowledge_base(user_sub, kb_id, action): - raise KnowledgeBasePermissionDeniedException("在该知识库导出知识库", str(kb_id)) + raise KnowledgeBasePermissionDeniedException("导出该知识库", str(kb_id)) kb_export_task_ids = await KnowledgeBaseService.export_kb_by_kb_ids(kb_ids) for kb_id in kb_ids: await TeamService.add_team_msg(user_sub, kb_id, IdType.KNOWLEDGE_BASE, MessageLevel.INFO, '导出了知识库{kbName}', 'knowledge base {kbName} exported') @@ -152,7 +152,7 @@ async def update_kb_by_kb_id( kb_id: Annotated[UUID, Query(alias="kbId")], req: Annotated[UpdateKnowledgeBaseRequest, Body()]): if not await KnowledgeBaseService.validate_user_action_to_knowledge_base(user_sub, kb_id, action): - raise KnowledgeBasePermissionDeniedException("在该知识库更新知识库", str(kb_id)) + raise KnowledgeBasePermissionDeniedException("更新该知识库", str(kb_id)) kb_id = await KnowledgeBaseService.update_kb_by_kb_id(kb_id, req) await TeamService.add_team_msg(user_sub, kb_id, IdType.KNOWLEDGE_BASE, MessageLevel.INFO, '更新了知识库{kbName}', 'knowledge base {kbName} updated') return UpdateKnowledgeBaseResponse(result=kb_id) @@ -165,7 +165,7 @@ async def delete_kb_by_kb_ids( kb_ids: Annotated[list[UUID], Body(alias="kbIds")]): for kb_id in kb_ids: if not await KnowledgeBaseService.validate_user_action_to_knowledge_base(user_sub, kb_id, action): - raise KnowledgeBasePermissionDeniedException("在该知识库删除知识库", str(kb_id)) + raise KnowledgeBasePermissionDeniedException("删除该知识库", str(kb_id)) for kb_id in kb_ids: await TeamService.add_team_msg(user_sub, kb_id, IdType.KNOWLEDGE_BASE, MessageLevel.WARNING, '删除了知识库{kbName}', 'knowledge base {kbName} deleted') kb_ids_deleted = await KnowledgeBaseService.delete_kb_by_kb_ids(kb_ids) diff --git a/data_chain/apps/router/task.py b/data_chain/apps/router/task.py index 27eeab432613fcb4b0c2ffae1d004cfc6bd21277..dd26a4a5832b1c1491705839807ae748ecff2bf4 100644 --- a/data_chain/apps/router/task.py +++ b/data_chain/apps/router/task.py @@ -43,7 +43,7 @@ async def delete_task_by_task_id( task_id: Annotated[UUID, Query(alias="taskId")], ): if not (await TaskService.validate_user_action_to_task(user_sub, task_id, action)): - raise TaskPermissionDeniedException("访问该团队的任务", str(req.team_id)) + raise TaskPermissionDeniedException("删除该任务", str(task_id)) task_id = await TaskService.delete_task_by_task_id(task_id) return DeleteTaskByIdResponse(message='任务删除成功', result=task_id) @@ -56,6 +56,6 @@ async def delete_task_by_task_type( task_type: Annotated[TaskType, Query(alias="taskType")], ): if not (await TeamService.validate_user_action_in_team(user_sub, team_id, action)): - raise TaskPermissionDeniedException("访问该团队的任务", str(req.team_id)) + raise TaskPermissionDeniedException("删除该团队的任务", str(team_id)) task_ids = await TaskService.delete_task_by_type(user_sub, team_id, task_type) return DeleteTaskByTypeResponse(message='任务删除成功', result=task_ids) diff --git a/data_chain/rerank/rerank.py b/data_chain/rerank/rerank.py index efbdb1ba8f4acf3790183b30d1351976bdfaaaf6..35ba9b688e8f3fd3ed832f5fb91991a966f27b87 100644 --- a/data_chain/rerank/rerank.py +++ b/data_chain/rerank/rerank.py @@ -75,4 +75,3 @@ class Rerank(): return documents[:top_k] documents_index = await Rerank.parse_response(response, top_k) return documents_index - diff --git a/data_chain/stores/database/database.py b/data_chain/stores/database/database.py index 3f85a54ace9879cb0e6ddec9c70cd87edfec306e..0eb976dfc0dd03ffab81042a4bfb6d1c23b02664 100644 --- a/data_chain/stores/database/database.py +++ b/data_chain/stores/database/database.py @@ -7,7 +7,7 @@ from uuid import uuid4 import urllib.parse from data_chain.logger.logger import logger as logging from pgvector.sqlalchemy import Vector -from sqlalchemy import Boolean, Column, ForeignKey, Integer, Float, String, func +from sqlalchemy import Boolean, Column, ForeignKey, BigInteger, Float, String, func from sqlalchemy.types import TIMESTAMP, UUID from sqlalchemy.dialects.postgresql import TSVECTOR from sqlalchemy.orm import declarative_base, DeclarativeBase, MappedAsDataclass, Mapped, mapped_column @@ -49,7 +49,7 @@ class TeamEntity(Base): author_name = Column(String) name = Column(String) description = Column(String) - member_cnt = Column(Integer, default=0) + member_cnt = Column(BigInteger, default=0) is_public = Column(Boolean, default=True) status = Column(String, default=TeamStatus.EXISTED.value) created_time = Column( @@ -239,13 +239,13 @@ class KnowledgeBaseEntity(Base): rerank_method = Column(String) rerank_name = Column(String) spearating_characters = Column(String) # 资产分块的分隔符 - doc_cnt = Column(Integer, default=0) # 资产文档个数 - doc_size = Column(Integer, default=0) # 资产下所有文档大小(TODO: 单位kb或者字节) - upload_count_limit = Column(Integer, default=128) # 更新次数限制 - upload_size_limit = Column(Integer, default=512) # 更新大小限制 + doc_cnt = Column(BigInteger, default=0) # 资产文档个数 + doc_size = Column(BigInteger, default=0) # 资产下所有文档大小(TODO: 单位kb或者字节) + upload_count_limit = Column(BigInteger, default=128) # 更新次数限制 + upload_size_limit = Column(BigInteger, default=512) # 更新大小限制 default_parse_method = Column( String, default=ParseMethod.GENERAL.value) # 默认解析方法 - default_chunk_size = Column(Integer, default=1024) # 默认分块大小 + default_chunk_size = Column(BigInteger, default=1024) # 默认分块大小 status = Column(String, default=KnowledgeBaseStatus.IDLE.value) created_time = Column( TIMESTAMP(timezone=True), @@ -289,11 +289,11 @@ class DocumentEntity(Base): author_name = Column(String) # 文档作者名称 name = Column(String) # 文档名 extension = Column(String) # 文件后缀 - size = Column(Integer) # 文档大小 + size = Column(BigInteger) # 文档大小 parse_method = Column(String, default=ParseMethod.GENERAL.value) # 文档解析方法 parse_relut_topology = Column( String, default=DocParseRelutTopology.LIST.value) # 文档解析结果拓扑结构 - chunk_size = Column(Integer) # 文档分块大小 + chunk_size = Column(BigInteger) # 文档分块大小 type_id = Column(UUID) # 文档类别 enabled = Column(Boolean) # 文档是否启用 status = Column(String, default=DocumentStatus.IDLE.value) # 文档状态 @@ -336,15 +336,15 @@ class ChunkEntity(Base): text = Column(String) # 片段文本内容 text_ts_vector = Column(TSVECTOR) # 片段文本词向量 text_vector = Column(Vector(1024)) # 文本向量 - tokens = Column(Integer) # 片段文本token数 + tokens = Column(BigInteger) # 片段文本token数 type = Column(String, default=ChunkType.TEXT.value) # 片段类型 # 前一个chunk的id(假如解析结果为链表,那么这里是前一个节点的id,如果文档解析结果为树,那么这里是父节点的id) pre_id_in_parse_topology = Column(UUID) # chunk的在解析结果中的拓扑类型(假如解析结果为链表,那么这里为链表头、中间和尾;假如解析结果为树,那么这里为树根、树的中间节点和叶子节点) parse_topology_type = Column( String, default=ChunkParseTopology.LISTHEAD.value) - global_offset = Column(Integer) # chunk在文档中的相对偏移 - local_offset = Column(Integer) # chunk在块中的相对偏移 + global_offset = Column(BigInteger) # chunk在文档中的相对偏移 + local_offset = Column(BigInteger) # chunk在块中的相对偏移 enabled = Column(Boolean) # chunk是否启用 status = Column(String, default=ChunkStatus.EXISTED.value) # chunk状态 created_time = Column( @@ -400,7 +400,7 @@ class DataSetEntity(Base): llm_id = Column(String) # 数据的生成使用的大模型的id name = Column(String, nullable=False) # 数据集名称 description = Column(String) # 数据集描述 - data_cnt = Column(Integer) # 数据集数据量 + data_cnt = Column(BigInteger) # 数据集数据量 is_data_cleared = Column(Boolean, default=False) # 数据集是否清洗 is_chunk_related = Column(Boolean, default=False) # 数据集是否关联上下文 is_imported = Column(Boolean, default=False) # 数据集是否导入 @@ -477,7 +477,7 @@ class TestingEntity(Base): llm_id = Column(String) # 测试任务的使用的大模型 search_method = Column( String, default=SearchMethod.KEYWORD_AND_VECTOR.value) # 测试任务的使用的检索增强模式类型 - top_k = Column(Integer, default=5) # 测试任务的检索增强模式的top_k + top_k = Column(BigInteger, default=5) # 测试任务的检索增强模式的top_k status = Column(String, default=TestingStatus.IDLE.value) # 测试任务的状态 ave_score = Column(Float, default=-1) # 测试任务的综合得分 ave_pre = Column(Float, default=-1) # 测试任务的平均召回率 @@ -542,7 +542,7 @@ class TaskEntity(Base): op_id = Column(UUID) # 任务关联的实体id, 资产或者文档id op_name = Column(String) # 任务关联的实体名称 type = Column(String) # 任务类型 - retry = Column(Integer) # 重试次数 + retry = Column(BigInteger) # 重试次数 status = Column(String) # 任务状态 created_time = Column( TIMESTAMP(timezone=True), @@ -562,8 +562,8 @@ class TaskReportEntity(Base): id = Column(UUID, default=uuid4, primary_key=True) # 任务报告的id task_id = Column(UUID, ForeignKey('task.id', ondelete="CASCADE")) # 任务id message = Column(String) # 任务报告信息 - current_stage = Column(Integer) # 任务当前阶段 - stage_cnt = Column(Integer) # 任务总的阶段 + current_stage = Column(BigInteger) # 任务当前阶段 + stage_cnt = Column(BigInteger) # 任务总的阶段 created_time = Column( TIMESTAMP(timezone=True), nullable=True,