소스 검색

知识库代码融合

yuchengwei 3 일 전
부모
커밋
d2195b9be8
6개의 변경된 파일1424개의 추가작업 그리고 1개의 파일을 삭제
  1. 44 0
      agent/models/web/knowledge_base.py
  2. 481 0
      agent/router/knowledge_base_router.py
  3. 4 1
      agent/server.py
  4. 374 0
      agent/utils.py
  5. 29 0
      config/site.py
  6. 492 0
      openapi.yaml

+ 44 - 0
agent/models/web/knowledge_base.py

@@ -0,0 +1,44 @@
+from datetime import datetime
+from typing import List
+from sqlalchemy import Column, Integer, String, Text, Boolean, DateTime, ForeignKey, Float
+from sqlalchemy.orm import relationship, declarative_base
+
+Base = declarative_base()
+
+class KnowledgeBase(Base):
+    """ORM model for the ``knowledge_base`` table; one row per knowledge base."""
+
+    __tablename__ = 'knowledge_base'
+
+    id = Column(Integer, primary_key=True, index=True)
+    name = Column(String(50), unique=True, index=True, nullable=False)
+    description = Column(Text, nullable=True)
+    tags = Column(String(200), nullable=True)
+    creator = Column(String(100), nullable=True)
+    file_count = Column(Integer, default=0)  # number of files
+    # Soft-delete flag kept as an integer; code in this commit filters on 0 and writes 1 on delete.
+    is_deleted = Column(Integer, default=0)
+    created_at = Column(DateTime, default=datetime.utcnow)
+    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
+
+    # One-to-many relationship to KnowledgeFile
+    files = relationship('KnowledgeFile', back_populates='knowledge_base')
+
+class KnowledgeFile(Base):
+    """ORM model for the ``knowledge_file`` table; one row per uploaded file."""
+
+    __tablename__ = 'knowledge_file'
+
+    id = Column(Integer, primary_key=True, index=True)
+    knowledge_base_id = Column(Integer, ForeignKey('knowledge_base.id'), nullable=False)
+    file_name = Column(String(255), nullable=False)
+    file_size = Column(Float, nullable=False)  # file size (MB)
+    file_type = Column(String(10), nullable=False)  # file extension
+    minio_url = Column(String(500), nullable=False)  # MinIO storage path
+    version = Column(String(50), nullable=True)
+    author = Column(String(100), nullable=True)
+    year = Column(Integer, nullable=True)
+    page_count = Column(Integer, nullable=True)  # number of pages in the document
+    creator = Column(String(100), nullable=True)  # creator
+    knowledge_type = Column(String(50), nullable=True)  # knowledge type
+    # Soft-delete flag kept as an integer; code in this commit filters on 0 and writes 1 on delete.
+    is_deleted = Column(Integer, default=0)
+    created_at = Column(DateTime, default=datetime.utcnow)
+    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
+
+    # Many-to-one relationship to KnowledgeBase
+    knowledge_base = relationship('KnowledgeBase', back_populates='files')

+ 481 - 0
agent/router/knowledge_base_router.py

@@ -0,0 +1,481 @@
+import os
+import io
+import logging
+import urllib.parse
+import time
+import glob
+import shutil
+import subprocess
+from typing import List, Optional
+from datetime import datetime
+from fastapi import APIRouter, FastAPI, Depends, HTTPException, UploadFile, File, Form
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse
+from fastapi.staticfiles import StaticFiles
+from fastapi.openapi.docs import (
+    get_redoc_html,
+    get_swagger_ui_html,
+    get_swagger_ui_oauth2_redirect_html,
+)
+from sqlalchemy import create_engine
+from sqlalchemy.orm import Session, sessionmaker
+from sqlalchemy.ext.declarative import declarative_base
+from pydantic import BaseModel, ConfigDict, Field, field_serializer
+from agent.models.web.knowledge_base import Base, KnowledgeBase, KnowledgeFile
+from agent.utils import DatabaseUtils, MinioUtils, FileUtils
+from config.site import settings
+
+
+# Response models
+class ResponseModel(BaseModel):
+    """Common response envelope: status ``code``, ``message`` text and a ``data`` payload."""
+
+    code: int
+    message: str
+    # Declared without a default value; may hold a dict, list, bool or None.
+    data: Optional[dict | list | bool | None]
+
+
+class KnowledgeBaseResponse(BaseModel):
+    """API representation of a knowledge base."""
+
+    # from_attributes allows model_validate() to be fed an ORM object, as the routes in this module do.
+    model_config = ConfigDict(from_attributes=True)
+
+    id: int
+    name: str
+    description: Optional[str] = None
+    tags: Optional[str] = None
+    creator: Optional[str] = None
+    file_count: int = 0
+    created_at: datetime = Field(default_factory=datetime.utcnow)
+    updated_at: datetime = Field(default_factory=datetime.utcnow)
+
+    @field_serializer('created_at', 'updated_at')
+    def serialize_datetime(self, dt: datetime) -> str:
+        """Serialize both timestamps as a date-only string (``YYYY-MM-DD``)."""
+        return dt.strftime('%Y-%m-%d')
+
+
+class KnowledgeFileResponse(BaseModel):
+    """API representation of an uploaded knowledge-base file."""
+
+    # from_attributes allows model_validate() to be fed an ORM object, as the routes in this module do.
+    model_config = ConfigDict(from_attributes=True)
+
+    id: int
+    knowledge_base_id: int
+    file_name: str
+    file_size: float
+    file_type: str
+    minio_url: str
+    version: Optional[str] = None
+    author: Optional[str] = None
+    year: Optional[int] = None
+    page_count: Optional[int] = None
+    creator: Optional[str] = None
+    knowledge_type: Optional[str] = None
+    created_at: datetime = Field(default_factory=datetime.utcnow)
+    updated_at: datetime = Field(default_factory=datetime.utcnow)
+
+    @field_serializer('created_at', 'updated_at')
+    def serialize_datetime(self, dt: datetime) -> str:
+        """Serialize both timestamps to minute precision (``YYYY-MM-DD HH:MM``)."""
+        return dt.strftime('%Y-%m-%d %H:%M')
+
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+# Create the database engine
+engine = create_engine(settings.DATABASE_URL)
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+
+# Create the database tables (runs when this module is imported)
+Base.metadata.create_all(bind=engine)
+
+router = APIRouter(tags=["knowledge base interface"])
+# logger = logging.getLogger(__name__)
+# config = SiteConfig()
+
+# Initialise the MinIO helper (its constructor checks/creates the bucket, so this also runs at import)
+minio_utils = MinioUtils()
+
+
+# 全局异常处理
+# @router.exception_handler(Exception)
+# async def global_exception_handler(request, exc):
+#     logger.error(f"全局异常: {exc}", exc_info=True)
+#     return {
+#         "code": 500,
+#         "message": "服务器内部错误",
+#         "data": None
+#     }
+
+
+# Dependency: obtain a database session
+def get_db():
+    """Yield a new ``SessionLocal`` session and close it once the consumer is done.
+
+    Used with ``Depends(get_db)`` by the routes in this module; the ``finally``
+    clause closes the session even when the consumer raises.
+    """
+    db = SessionLocal()
+    try:
+        yield db
+    finally:
+        db.close()
+
+
+# Request models
+class KnowledgeBaseCreate(BaseModel):
+    """Request body for creating a knowledge base; only ``name`` is required."""
+
+    name: str
+    description: Optional[str] = None
+    tags: Optional[str] = None
+
+
+class KnowledgeBaseUpdate(BaseModel):
+    """Request body for updating a knowledge base; only ``name`` is required."""
+
+    name: str
+    description: Optional[str] = None
+    tags: Optional[str] = None
+
+
+class FileUpdate(BaseModel):
+    """Metadata changes for one file, identified by ``id``.
+
+    In ``batch_update_files`` a field left as ``None`` is not applied to the record.
+    """
+
+    id: int
+    file_name: Optional[str] = None
+    version: Optional[str] = None
+    author: Optional[str] = None
+    year: Optional[int] = None
+    page_count: Optional[int] = None
+    creator: Optional[str] = None
+    knowledge_type: Optional[str] = None
+
+
+class BatchFileUpdate(BaseModel):
+    """Request body for the batch update route: a list of per-file updates."""
+
+    files: List[FileUpdate]
+
+
+# 使用utils.py中的FileUtils类进行文件转换
+
+@router.post("/knowledge-base/", response_model=ResponseModel)
+def create_knowledge_base(kb: KnowledgeBaseCreate, db: Session = Depends(get_db)):
+    kb_data = DatabaseUtils.create_knowledge_base(db, kb.name, kb.description, kb.tags)
+    return ResponseModel(
+        code=200,
+        message="创建成功",
+        data=KnowledgeBaseResponse.model_validate(kb_data).model_dump()
+    )
+
+
+@router.put("/knowledge-base/{kb_id}", response_model=ResponseModel)
+def update_knowledge_base(kb_id: int, kb: KnowledgeBaseUpdate, db: Session = Depends(get_db)):
+    kb_data = DatabaseUtils.update_knowledge_base(db, kb_id, kb.name, kb.description, kb.tags)
+    return ResponseModel(
+        code=200,
+        message="更新成功",
+        data=KnowledgeBaseResponse.model_validate(kb_data).model_dump()
+    )
+
+
+@router.delete("/knowledge-base/{kb_id}", response_model=ResponseModel)
+def delete_knowledge_base(kb_id: int, db: Session = Depends(get_db)):
+    result = DatabaseUtils.delete_knowledge_base(db, kb_id)
+    return ResponseModel(
+        code=200,
+        message="删除成功",
+        data=result
+    )
+
+
+@router.get("/knowledge-base/{kb_id}", response_model=ResponseModel)
+def get_knowledge_base(kb_id: int, db: Session = Depends(get_db)):
+    kb = db.query(KnowledgeBase).filter(KnowledgeBase.id == kb_id, KnowledgeBase.is_deleted == 0).first()
+    if not kb:
+        raise HTTPException(status_code=404, detail="知识库不存在")
+    return ResponseModel(
+        code=200,
+        message="查询成功",
+        data=KnowledgeBaseResponse.model_validate(kb).model_dump()
+    )
+
+
+@router.get("/knowledge-base/", response_model=ResponseModel)
+def list_knowledge_bases(pageNo: int = 1, pageSize: int = 10, name: Optional[str] = None,
+                         db: Session = Depends(get_db)):
+    if pageNo < 1:
+        raise HTTPException(status_code=400, detail="页码必须大于等于1")
+    if pageSize < 1:
+        raise HTTPException(status_code=400, detail="每页条数必须大于等于1")
+    skip = (pageNo - 1) * pageSize
+    kb_list, total = DatabaseUtils.get_knowledge_bases(db, skip, pageSize, name)
+    return ResponseModel(
+        code=200,
+        message="查询成功",
+        data={
+            "list": [KnowledgeBaseResponse.model_validate(kb).model_dump() for kb in kb_list],
+            "total": total
+        }
+    )
+
+
+@router.get("/knowledge-base/name/{name}", response_model=ResponseModel)
+def get_knowledge_base_by_name(name: str, db: Session = Depends(get_db)):
+    kb = DatabaseUtils.get_knowledge_base_by_name(db, name)
+    if not kb:
+        raise HTTPException(status_code=404, detail="知识库不存在")
+    return ResponseModel(
+        code=200,
+        message="查询成功",
+        data=KnowledgeBaseResponse.model_validate(kb).model_dump()
+    )
+
+
+@router.post("/knowledge-base/{kb_id}/files/", response_model=ResponseModel)
+async def upload_files(
+        kb_id: int,
+        files: List[UploadFile] = File(...),
+        db: Session = Depends(get_db)
+):
+    # 验证知识库是否存在
+    kb = db.query(KnowledgeBase).filter(KnowledgeBase.id == kb_id, KnowledgeBase.is_deleted == 0).first()
+    if not kb:
+        raise HTTPException(status_code=404, detail="知识库不存在")
+
+    # 验证文件数量
+    if len(files) > settings.MAX_FILE_COUNT:
+        raise HTTPException(status_code=400, detail=f"单次上传文件数量不能超过{settings.MAX_FILE_COUNT}个")
+
+    # 导入所需模块
+    import tempfile
+
+    uploaded_files = []
+    for file in files:
+        # 获取文件扩展名
+        file_ext = os.path.splitext(file.filename)[1].lower().lstrip('.')
+        original_filename = file.filename
+        converted_content = None
+
+        # 读取文件内容
+        content = await file.read()
+
+        # 处理需要转换的文件格式
+        if file_ext in ["doc", "ppt"]:
+            # 创建临时目录用于文件转换
+            with tempfile.TemporaryDirectory() as temp_dir:
+                # 创建临时文件
+                temp_input_path = os.path.join(temp_dir, original_filename)
+                with open(temp_input_path, "wb") as temp_file:
+                    temp_file.write(content)
+
+                # 确定目标格式
+                target_format = "docx" if file_ext == "doc" else "pptx"
+
+                # 使用FileUtils中的文件转换方法
+                converted_file_path = FileUtils.convert_office_file(temp_input_path, temp_dir, target_format)
+
+                if converted_file_path and os.path.exists(converted_file_path):
+                    # 读取转换后的文件内容
+                    with open(converted_file_path, "rb") as converted_file:
+                        converted_content = converted_file.read()
+
+                    # 更新文件名和扩展名
+                    file_ext = target_format
+                    file.filename = os.path.splitext(original_filename)[0] + f".{target_format}"
+                else:
+                    # 转换失败,使用原始文件
+                    raise HTTPException(status_code=500, detail=f"文件格式转换失败: {original_filename}")
+
+        # 检查文件格式是否支持
+        if file_ext not in settings.ALLOWED_EXTENSIONS:
+            raise HTTPException(status_code=400, detail=f"不支持的文件格式:{file_ext}")
+
+        # 使用转换后的内容或原始内容
+        file_content = converted_content if converted_content else content
+        file_size = len(file_content) / (1024 * 1024)  # 转换为MB
+
+        # 验证文件大小
+        max_size = settings.ALLOWED_EXTENSIONS[file_ext]["max_size"]
+        if file_size > max_size:
+            raise HTTPException(status_code=400, detail=f"{original_filename}超过最大允许大小{max_size}MB")
+
+        # 上传到MinIO
+        minio_url = minio_utils.upload_file(file_content, file.filename, file.content_type)
+
+        # 从文件名识别知识类型
+        knowledge_type = None
+        if '指南' in file.filename:
+            knowledge_type = '指南'
+        elif '教材' in file.filename:
+            knowledge_type = '教材'
+
+        # 创建文件记录
+        db_file = KnowledgeFile(
+            knowledge_base_id=kb_id,
+            file_name=file.filename,
+            file_size=file_size,
+            file_type=file_ext,
+            minio_url=minio_url,
+            creator=kb.creator,  # 使用知识库的创建人作为文件创建人
+            knowledge_type=knowledge_type
+        )
+        db.add(db_file)
+        uploaded_files.append(db_file)
+        # 更新知识库文件计数
+        DatabaseUtils.increment_file_count(db, kb_id)
+
+    db.commit()
+    return ResponseModel(
+        code=200,
+        message="上传成功",
+        data=[KnowledgeFileResponse.model_validate(file).model_dump() for file in uploaded_files]
+    )
+
+
+@router.get("/knowledge-base/{kb_id}/files/", response_model=ResponseModel)
+def list_files(kb_id: int, pageNo: int = 1, pageSize: int = 10, file_name: Optional[str] = None,
+               db: Session = Depends(get_db)):
+    if pageNo < 1:
+        raise HTTPException(status_code=400, detail="页码必须大于等于1")
+    if pageSize < 1:
+        raise HTTPException(status_code=400, detail="每页条数必须大于等于1")
+
+    skip = (pageNo - 1) * pageSize
+    query = db.query(KnowledgeFile).filter(
+        KnowledgeFile.knowledge_base_id == kb_id,
+        KnowledgeFile.is_deleted == 0
+    )
+
+    if file_name:
+        query = query.filter(KnowledgeFile.file_name.ilike(f"%{file_name}%"))
+
+    total = query.count()
+    files = query.offset(skip).limit(pageSize).all()
+
+    return ResponseModel(
+        code=200,
+        message="查询成功",
+        data={
+            "list": [KnowledgeFileResponse.model_validate(file).model_dump() for file in files],
+            "total": total
+        }
+    )
+
+
+@router.get("/knowledge-base/{kb_id}/files/search/", response_model=ResponseModel)
+def search_files(kb_id: int, file_name: str, db: Session = Depends(get_db)):
+    files = db.query(KnowledgeFile).filter(
+        KnowledgeFile.knowledge_base_id == kb_id,
+        KnowledgeFile.file_name.ilike(f"%{file_name}%"),
+        KnowledgeFile.is_deleted == 0
+    ).all()
+    return ResponseModel(
+        code=200,
+        message="查询成功",
+        data=[KnowledgeFileResponse.model_validate(file).model_dump() for file in files]
+    )
+
+
+@router.get("/files/{file_id}/download")
+def download_file(file_id: int, db: Session = Depends(get_db)):
+    # 获取文件信息
+    file = db.query(KnowledgeFile).filter(
+        KnowledgeFile.id == file_id,
+        KnowledgeFile.is_deleted == 0
+    ).first()
+    if not file:
+        raise HTTPException(status_code=404, detail="文件不存在")
+
+    # 从MinIO下载文件
+    object_name = file.minio_url.split("/")[-1]
+    file_content = minio_utils.download_file(object_name)
+
+    # 创建文件流
+    file_stream = io.BytesIO(file_content)
+
+    # 对文件名进行URL编码
+    encoded_filename = urllib.parse.quote(file.file_name)
+
+    return StreamingResponse(
+        file_stream,
+        media_type="application/octet-stream",
+        headers={
+            "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}"
+        }
+    )
+
+
+@router.delete("/files/{file_id}", response_model=dict)
+def delete_file(file_id: int, db: Session = Depends(get_db)):
+    # 获取文件信息
+    file = db.query(KnowledgeFile).filter(
+        KnowledgeFile.id == file_id,
+        KnowledgeFile.is_deleted == 0
+    ).first()
+    if not file:
+        raise HTTPException(status_code=404, detail="文件不存在")
+
+    # 从MinIO删除文件
+    object_name = file.minio_url.split("/")[-1]
+    minio_utils.delete_file(object_name)
+
+    # 标记文件为已删除
+    file.is_deleted = 1
+    file.updated_at = datetime.utcnow()
+    # 更新知识库文件计数
+    DatabaseUtils.decrement_file_count(db, file.knowledge_base_id)
+    db.commit()
+
+    return {
+        "code": 200,
+        "message": "删除成功",
+        "data": True
+    }
+
+
+@router.put("/files/batch-update", response_model=ResponseModel)
+def batch_update_files(update_data: BatchFileUpdate, db: Session = Depends(get_db)):
+    updated_files = []
+    for file_update in update_data.files:
+        # 获取文件信息
+        file = db.query(KnowledgeFile).filter(
+            KnowledgeFile.id == file_update.id,
+            KnowledgeFile.is_deleted == 0
+        ).first()
+        if not file:
+            raise HTTPException(status_code=404, detail=f"文件ID {file_update.id} 不存在")
+
+        # 如果需要更新文件名,同时更新MinIO中的文件
+        if file_update.file_name and file_update.file_name != file.file_name:
+            old_object_name = file.minio_url.split("/")[-1]
+            new_object_name = f"{datetime.utcnow().strftime('%Y%m%d%H%M%S')}_{file_update.file_name}"
+
+            # 从MinIO下载文件
+            file_content = minio_utils.download_file(old_object_name)
+
+            # 上传到MinIO新的位置
+            new_minio_url = minio_utils.upload_file(
+                file_content,
+                file_update.file_name,
+                file.file_type
+            )
+
+            # 删除旧文件
+            minio_utils.delete_file(old_object_name)
+
+            # 更新数据库中的文件名和MinIO URL
+            file.file_name = file_update.file_name
+            file.minio_url = new_minio_url
+
+        # 更新其他字段
+        if file_update.version is not None:
+            file.version = file_update.version
+        if file_update.author is not None:
+            file.author = file_update.author
+        if file_update.year is not None:
+            file.year = file_update.year
+        if file_update.page_count is not None:
+            file.page_count = file_update.page_count
+        if file_update.creator is not None:
+            file.creator = file_update.creator
+        if file_update.knowledge_type is not None:
+            file.knowledge_type = file_update.knowledge_type
+
+        file.updated_at = datetime.utcnow()
+        updated_files.append(file)
+
+    db.commit()
+    return ResponseModel(
+        code=200,
+        message="更新成功",
+        data=[KnowledgeFileResponse.model_validate(file).model_dump() for file in updated_files]
+    )
+
+# Alias under which agent/server.py imports and mounts this router
+knowledge_base_router = router

+ 4 - 1
agent/server.py

@@ -35,7 +35,7 @@ handler.setLevel(logging.INFO)
 formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 handler.setFormatter(formatter)
 logging.getLogger().addHandler(handler)
-app = FastAPI()
+app = FastAPI(root_path="/open_platform")
 # 允许所有来源的跨域请求
 app.add_middleware(
     CORSMiddleware,
@@ -63,4 +63,7 @@ app.include_router(kb_router)
 #app.include_router(graph_router)
 
 #app.include_router(dify_kb_router)
+
+from router.knowledge_base_router import knowledge_base_router
+app.include_router(knowledge_base_router)
 save_api_spec(app)

+ 374 - 0
agent/utils.py

@@ -0,0 +1,374 @@
+import re
+import io
+import os
+import time
+import glob
+import shutil
+import logging
+import subprocess
+from datetime import datetime
+from typing import List, Optional
+from minio import Minio
+import urllib3
+from sqlalchemy.orm import Session
+from fastapi import HTTPException
+from agent.models.web.knowledge_base import KnowledgeBase, KnowledgeFile
+from config.site import settings
+
+# Configure the logger used for office file conversion
+office_logger = logging.getLogger('office_conversion')
+office_logger.setLevel(logging.INFO)
+# Attach a stream handler only once, so re-importing the module does not duplicate output
+if not office_logger.handlers:
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    handler.setFormatter(formatter)
+    office_logger.addHandler(handler)
+
+class DatabaseUtils:
+    @staticmethod
+    def validate_knowledge_base_name(name: str) -> bool:
+        pattern = r'^[a-zA-Z0-9\u4e00-\u9fa5_\-\.]+$'
+        return bool(re.match(pattern, name))
+
+    @staticmethod
+    def create_knowledge_base(db: Session, name: str, description: Optional[str] = None, tags: Optional[str] = None) -> KnowledgeBase:
+        if not DatabaseUtils.validate_knowledge_base_name(name):
+            raise HTTPException(status_code=400, detail="知识库名称格式不正确")
+        
+        if description and len(description) > 400:
+            raise HTTPException(status_code=400, detail="知识库备注不能超过400字")
+        
+        db_kb = KnowledgeBase(name=name, description=description, tags=tags, file_count=0)
+        db.add(db_kb)
+        db.commit()
+        db.refresh(db_kb)
+        return db_kb
+
+    @staticmethod
+    def update_knowledge_base(db: Session, kb_id: int, name: str, description: Optional[str] = None, tags: Optional[str] = None) -> KnowledgeBase:
+        db_kb = db.query(KnowledgeBase).filter(KnowledgeBase.id == kb_id, KnowledgeBase.is_deleted == 0).first()
+        if not db_kb:
+            raise HTTPException(status_code=404, detail="知识库不存在")
+        
+        if name and not DatabaseUtils.validate_knowledge_base_name(name):
+            raise HTTPException(status_code=400, detail="知识库名称格式不正确")
+        
+        if description and len(description) > 400:
+            raise HTTPException(status_code=400, detail="知识库备注不能超过400字")
+        
+        db_kb.name = name
+        db_kb.description = description
+        db_kb.tags = tags
+        db_kb.updated_at = datetime.utcnow()
+        
+        db.commit()
+        db.refresh(db_kb)
+        return db_kb
+
+    @staticmethod
+    def delete_knowledge_base(db: Session, kb_id: int) -> bool:
+        db_kb = db.query(KnowledgeBase).filter(KnowledgeBase.id == kb_id, KnowledgeBase.is_deleted == 0).first()
+        if not db_kb:
+            raise HTTPException(status_code=404, detail="知识库不存在")
+        
+        # 删除知识库时将文件计数清零
+        db_kb.file_count = 0
+        db_kb.is_deleted = 1
+        db_kb.updated_at = datetime.utcnow()
+        db.commit()
+        return True
+
+    @staticmethod
+    def get_knowledge_bases(db: Session, skip: int = 0, limit: int = 10, name: Optional[str] = None) -> tuple[List[KnowledgeBase], int]:
+        query = db.query(KnowledgeBase).filter(KnowledgeBase.is_deleted == 0)
+        if name:
+            query = query.filter(KnowledgeBase.name.ilike(f"%{name}%"))
+        total = query.count()
+        knowledge_bases = query.offset(skip).limit(limit).all()
+        return knowledge_bases, total
+
+    @staticmethod
+    def get_knowledge_base_by_name(db: Session, name: str) -> Optional[KnowledgeBase]:
+        return db.query(KnowledgeBase).filter(KnowledgeBase.name == name, KnowledgeBase.is_deleted == 0).first()
+
+    @staticmethod
+    def increment_file_count(db: Session, kb_id: int) -> None:
+        db_kb = db.query(KnowledgeBase).filter(KnowledgeBase.id == kb_id, KnowledgeBase.is_deleted == 0).first()
+        if db_kb:
+            db_kb.file_count += 1
+            db.commit()
+
+    @staticmethod
+    def decrement_file_count(db: Session, kb_id: int) -> None:
+        db_kb = db.query(KnowledgeBase).filter(KnowledgeBase.id == kb_id, KnowledgeBase.is_deleted == 0).first()
+        if db_kb and db_kb.file_count > 0:
+            db_kb.file_count -= 1
+            db.commit()
+
+class MinioUtils:
+    def __init__(self):
+        self.client = Minio(
+            settings.MINIO_ENDPOINT,
+            access_key=settings.MINIO_ACCESS_KEY,
+            secret_key=settings.MINIO_SECRET_KEY,
+            secure=settings.MINIO_SECURE,
+            http_client=urllib3.PoolManager(
+                timeout=urllib3.Timeout(connect=10, read=60),
+                maxsize=50,
+                retries=urllib3.Retry(
+                    total=5,
+                    backoff_factor=0.5,
+                    status_forcelist=[500, 502, 503, 504]
+                )
+            )
+        )
+        self._ensure_bucket_exists()
+
+    def _ensure_bucket_exists(self):
+        if not self.client.bucket_exists(settings.MINIO_BUCKET_NAME):
+            self.client.make_bucket(settings.MINIO_BUCKET_NAME)
+
+    def upload_file(self, file_data: bytes, file_name: str, content_type: str, part_size: int = 15 * 1024 * 1024) -> str:
+        import tempfile
+        import os
+        
+        object_name = file_name
+
+        try:
+            # 创建临时文件
+            with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+                temp_file.write(file_data)
+                temp_file_path = temp_file.name
+
+            # 使用fput_object进行上传,内部已实现分片上传
+            self.client.fput_object(
+                bucket_name=settings.MINIO_BUCKET_NAME,
+                object_name=object_name,
+                file_path=temp_file_path,
+                content_type=content_type,
+                part_size=part_size  # 使用更大的分片大小,提高上传效率
+            )
+
+            return f"http://{settings.MINIO_ENDPOINT}/{settings.MINIO_BUCKET_NAME}/{object_name}"
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"文件上传失败: {str(e)}")
+        finally:
+            # 清理临时文件
+            try:
+                os.unlink(temp_file_path)
+            except:
+                pass
+
+    def download_file(self, object_name: str) -> bytes:
+        try:
+            response = self.client.get_object(settings.MINIO_BUCKET_NAME, object_name)
+            return response.read()
+        finally:
+            response.close()
+            response.release_conn()
+
+    def delete_file(self, object_name: str) -> bool:
+        try:
+            self.client.remove_object(settings.MINIO_BUCKET_NAME, object_name)
+            return True
+        except:
+            return False
+
+class FileUtils:
+    @staticmethod
+    def convert_office_file(input_path, output_dir, target_format):
+        """使用LibreOffice转换Office文件格式
+        
+        Args:
+            input_path (str): 输入文件路径
+            output_dir (str): 输出目录
+            target_format (str): 目标格式,如docx、pptx等
+            
+        Returns:
+            str: 转换后的文件路径,转换失败则返回None
+        """
+        # 检查输入文件是否存在
+        if not os.path.exists(input_path):
+            office_logger.error(f"输入文件不存在: {input_path}")
+            return None
+            
+        # 检查输出目录是否存在,不存在则创建
+        if not os.path.exists(output_dir):
+            try:
+                os.makedirs(output_dir)
+                office_logger.info(f"创建输出目录: {output_dir}")
+            except OSError as e:
+                office_logger.error(f"创建输出目录失败: {e}")
+                return None
+        
+        # 检查输出目录权限
+        if not os.access(output_dir, os.W_OK):
+            office_logger.error(f"输出目录没有写入权限: {output_dir}")
+            return None
+        
+        # 检查LibreOffice是否安装
+        libreoffice_cmd = "soffice"  # Linux/macOS
+        if os.name == 'nt':  # Windows
+            libreoffice_cmd = r"C:\Program Files\LibreOffice\program\soffice.exe"
+        
+        # 检查LibreOffice命令是否可用
+        try:
+            version_cmd = [libreoffice_cmd, "--version"]
+            version_result = subprocess.run(version_cmd, check=True, capture_output=True, text=True)
+            office_logger.info(f"LibreOffice版本: {version_result.stdout.strip()}")
+        except (subprocess.SubprocessError, FileNotFoundError) as e:
+            office_logger.error(f"LibreOffice未安装或不可用: {e}")
+            return None
+        
+        # 获取输入文件的文件名(不含路径和扩展名)
+        filename = os.path.basename(input_path)
+        base_name = os.path.splitext(filename)[0]
+        input_ext = os.path.splitext(filename)[1][1:].lower()
+        office_logger.info(f"原始文件名: {filename}, 基本名称: {base_name}, 扩展名: {input_ext}")
+        
+        # 如果输入文件扩展名与目标格式相同,直接复制文件
+        if input_ext == target_format.lower():
+            office_logger.info(f"输入文件已经是目标格式,直接复制文件")
+            final_output_path = os.path.join(output_dir, f"{base_name}.{target_format}")
+            try:
+                shutil.copy2(input_path, final_output_path)
+                office_logger.info(f"复制文件到最终位置: {final_output_path}")
+                return final_output_path
+            except (shutil.Error, IOError) as e:
+                office_logger.error(f"复制文件失败: {e}")
+                return None
+        
+        # 创建临时工作目录,避免中文路径问题
+        temp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"temp_convert_{int(time.time())}")
+        try:
+            os.makedirs(temp_dir)
+            office_logger.info(f"创建临时工作目录: {temp_dir}")
+        except OSError as e:
+            office_logger.error(f"创建临时工作目录失败: {e}")
+            return None
+        
+        # 复制原文件到临时目录,使用英文文件名
+        temp_input_file = os.path.join(temp_dir, f"input.{input_ext}")
+        try:
+            shutil.copy2(input_path, temp_input_file)
+            office_logger.info(f"复制文件到临时目录: {temp_input_file}")
+        except (shutil.Error, IOError) as e:
+            office_logger.error(f"复制文件失败: {e}")
+            shutil.rmtree(temp_dir, ignore_errors=True)
+            return None
+        
+        # 记录转换前输出目录中的文件
+        before_files = set(os.listdir(temp_dir))
+        office_logger.debug(f"转换前临时目录内容: {before_files}")
+        
+        # 构建转换命令
+        cmd = [
+            libreoffice_cmd,
+            "--headless"
+        ]
+        
+        # 根据文件类型选择合适的转换参数
+        if input_ext == 'doc' and target_format.lower() == 'docx':
+            cmd.extend(["--convert-to", "docx:MS Word 2007 XML"])
+        elif input_ext == 'ppt' and target_format.lower() == 'pptx':
+            cmd.extend(["--convert-to", "pptx:Impress MS PowerPoint 2007 XML"])
+        else:
+            cmd.extend(["--convert-to", target_format])
+        
+        # 添加输出目录和输入文件
+        cmd.extend(["--outdir", temp_dir, temp_input_file])
+        
+        office_logger.info(f"开始转换文件: {temp_input_file} -> {target_format}")
+        office_logger.info(f"执行命令: {' '.join(cmd)}")
+        
+        # 切换到临时目录执行命令,避免路径问题
+        current_dir = os.getcwd()
+        os.chdir(temp_dir)
+        
+        try:
+            result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+            office_logger.info(f"转换命令输出: {result.stdout}")
+            if result.stderr:
+                office_logger.warning(f"转换命令错误输出: {result.stderr}")
+            
+            # 切回原目录
+            os.chdir(current_dir)
+            
+            # 等待一小段时间确保文件写入完成
+            time.sleep(1)
+            
+            # 记录转换后输出目录中的文件
+            after_files = set(os.listdir(temp_dir))
+            office_logger.debug(f"转换后临时目录内容: {after_files}")
+            
+            # 找出新增的文件
+            new_files = after_files - before_files
+            office_logger.info(f"新增文件: {new_files}")
+            
+            # 预期的输出文件名
+            expected_output_filename = f"input.{target_format}"
+            
+            # 预期的输出文件路径(在临时目录中)
+            expected_output_path = os.path.join(temp_dir, expected_output_filename)
+            
+            # 最终的输出文件路径(在目标目录中)
+            final_output_path = os.path.join(output_dir, f"{base_name}.{target_format}")
+            
+            # 检查预期的输出文件是否存在
+            if os.path.exists(expected_output_path):
+                # 复制转换后的文件到最终目标位置
+                try:
+                    shutil.copy2(expected_output_path, final_output_path)
+                    office_logger.info(f"复制转换后的文件到最终位置: {final_output_path}")
+                    # 清理临时目录
+                    shutil.rmtree(temp_dir, ignore_errors=True)
+                    return final_output_path
+                except (shutil.Error, IOError) as e:
+                    office_logger.error(f"复制转换后的文件失败: {e}")
+            elif new_files:
+                # 如果有新文件生成,使用第一个新文件
+                new_file_path = os.path.join(temp_dir, list(new_files)[0])
+                try:
+                    shutil.copy2(new_file_path, final_output_path)
+                    office_logger.info(f"复制新生成的文件到最终位置: {final_output_path}")
+                    # 清理临时目录
+                    shutil.rmtree(temp_dir, ignore_errors=True)
+                    return final_output_path
+                except (shutil.Error, IOError) as e:
+                    office_logger.error(f"复制新生成的文件失败: {e}")
+            else:
+                # 尝试在临时目录中查找匹配的文件
+                pattern = os.path.join(temp_dir, f"*.{target_format}")
+                matching_files = glob.glob(pattern)
+                office_logger.info(f"匹配的文件列表: {matching_files}")
+                
+                if matching_files:
+                    # 按修改时间排序,获取最新的文件
+                    newest_file = max(matching_files, key=os.path.getmtime)
+                    try:
+                        shutil.copy2(newest_file, final_output_path)
+                        office_logger.info(f"复制匹配的文件到最终位置: {final_output_path}")
+                        # 清理临时目录
+                        shutil.rmtree(temp_dir, ignore_errors=True)
+                        return final_output_path
+                    except (shutil.Error, IOError) as e:
+                        office_logger.error(f"复制匹配的文件失败: {e}")
+            
+            # 如果所有尝试都失败,清理临时目录并返回None
+            office_logger.error(f"转换后的文件不存在或无法复制")
+            shutil.rmtree(temp_dir, ignore_errors=True)
+            return None
+        except subprocess.CalledProcessError as e:
+            # 切回原目录
+            os.chdir(current_dir)
+            office_logger.error(f"转换失败: {e.stderr if hasattr(e, 'stderr') else str(e)}")
+            # 清理临时目录
+            shutil.rmtree(temp_dir, ignore_errors=True)
+            return None
+        except Exception as e:
+            # 切回原目录
+            os.chdir(current_dir)
+            office_logger.error(f"转换过程中发生未知错误: {str(e)}")
+            # 清理临时目录
+            shutil.rmtree(temp_dir, ignore_errors=True)
+            return None

+ 29 - 0
config/site.py

@@ -1,6 +1,8 @@
 import os
 from dotenv import load_dotenv
 from urllib.parse import quote
+from pydantic_settings import BaseSettings
+from typing import Dict, Union, Optional
 
 load_dotenv()
 
@@ -54,4 +56,31 @@ class SiteConfig:
         for item in config_list:
             if not self.get_config(item):
                 raise ValueError(f"Configuration '{item}' is not set.")
+
+
+# Settings for the knowledge-base feature: database URL, MinIO object storage
+# and file-upload limits, declared as a pydantic-settings ``BaseSettings`` model.
+# Per the pydantic-settings documentation, a field is populated from an
+# environment variable of the same name when one is set; the literals below
+# are only the fallback defaults.
+class Settings(BaseSettings):
+    # PostgreSQL configuration
+    # NOTE(review): the default URL embeds a username, a password and a fixed
+    # host address. Credentials committed to source control should be rotated
+    # and supplied through the environment / .env file instead.
+    DATABASE_URL: str = "postgresql://knowledge:qwer1234.@173.18.12.203:5432/knowledge_base"
+
+    # MinIO configuration
+    # NOTE(review): the access key and secret key defaults below are literal
+    # credentials in source; same remediation as for DATABASE_URL.
+    MINIO_ENDPOINT: str = "173.18.12.199:9000"
+    MINIO_ACCESS_KEY: str = "yvhNcezRwQvPuUylHqrg"
+    MINIO_SECRET_KEY: str = "QQCejGeENpUIkGr4yfDaubPwWCoV29xHoXv6gHYU"
+    MINIO_BUCKET_NAME: str = "knowledge-base"
+    # Presumably passed to the MinIO client as its TLS switch (False = plain
+    # HTTP) -- TODO confirm against the code that builds the client.
+    MINIO_SECURE: bool = False
+
+    # File upload configuration
+    # Presumably the maximum number of files accepted (per upload or per
+    # knowledge base) -- the consumer is not visible here; verify in the router.
+    MAX_FILE_COUNT: int = 100
+    # Maps a lower-case file extension (without the dot) to its limits:
+    #   "max_size"  -- size limit; presumably in MB -- TODO confirm
+    #   "max_pages" -- page limit; None presumably means "no page limit"
+    # NOTE(review): Union[int, Optional[int]] is equivalent to Optional[int].
+    ALLOWED_EXTENSIONS: Dict[str, Dict[str, Union[int, Optional[int]]]] = {
+        "doc": {"max_size": 50, "max_pages": 1000},
+        "txt": {"max_size": 10, "max_pages": None},
+        "docx": {"max_size": 50, "max_pages": 1000},
+        "pdf": {"max_size": 500, "max_pages": 3000},
+        "ppt": {"max_size": 50, "max_pages": 1000},
+        "pptx": {"max_size": 50, "max_pages": 1000},
+        "md": {"max_size": 10, "max_pages": None}
+    }
+
+
+# Module-level Settings instance; Settings() is evaluated when this module is
+# imported, so any validation error in the configuration surfaces at import time.
+settings = Settings()
       

+ 492 - 0
openapi.yaml

@@ -332,6 +332,369 @@ paths:
             application/json:
               schema:
                 $ref: '#/components/schemas/HTTPValidationError'
+  /knowledge-base/:
+    post:
+      tags:
+      - knowledge base interface
+      summary: Create Knowledge Base
+      operationId: create_knowledge_base_knowledge_base__post
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/KnowledgeBaseCreate'
+      responses:
+        '200':
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ResponseModel'
+        '422':
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+    get:
+      tags:
+      - knowledge base interface
+      summary: List Knowledge Bases
+      operationId: list_knowledge_bases_knowledge_base__get
+      parameters:
+      - name: pageNo
+        in: query
+        required: false
+        schema:
+          type: integer
+          default: 1
+          title: Pageno
+      - name: pageSize
+        in: query
+        required: false
+        schema:
+          type: integer
+          default: 10
+          title: Pagesize
+      - name: name
+        in: query
+        required: false
+        schema:
+          anyOf:
+          - type: string
+          - type: 'null'
+          title: Name
+      responses:
+        '200':
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ResponseModel'
+        '422':
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+  /knowledge-base/{kb_id}:
+    put:
+      tags:
+      - knowledge base interface
+      summary: Update Knowledge Base
+      operationId: update_knowledge_base_knowledge_base__kb_id__put
+      parameters:
+      - name: kb_id
+        in: path
+        required: true
+        schema:
+          type: integer
+          title: Kb Id
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/KnowledgeBaseUpdate'
+      responses:
+        '200':
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ResponseModel'
+        '422':
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+    delete:
+      tags:
+      - knowledge base interface
+      summary: Delete Knowledge Base
+      operationId: delete_knowledge_base_knowledge_base__kb_id__delete
+      parameters:
+      - name: kb_id
+        in: path
+        required: true
+        schema:
+          type: integer
+          title: Kb Id
+      responses:
+        '200':
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ResponseModel'
+        '422':
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+    get:
+      tags:
+      - knowledge base interface
+      summary: Get Knowledge Base
+      operationId: get_knowledge_base_knowledge_base__kb_id__get
+      parameters:
+      - name: kb_id
+        in: path
+        required: true
+        schema:
+          type: integer
+          title: Kb Id
+      responses:
+        '200':
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ResponseModel'
+        '422':
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+  /knowledge-base/name/{name}:
+    get:
+      tags:
+      - knowledge base interface
+      summary: Get Knowledge Base By Name
+      operationId: get_knowledge_base_by_name_knowledge_base_name__name__get
+      parameters:
+      - name: name
+        in: path
+        required: true
+        schema:
+          type: string
+          title: Name
+      responses:
+        '200':
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ResponseModel'
+        '422':
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+  /knowledge-base/{kb_id}/files/:
+    post:
+      tags:
+      - knowledge base interface
+      summary: Upload Files
+      operationId: upload_files_knowledge_base__kb_id__files__post
+      parameters:
+      - name: kb_id
+        in: path
+        required: true
+        schema:
+          type: integer
+          title: Kb Id
+      requestBody:
+        required: true
+        content:
+          multipart/form-data:
+            schema:
+              $ref: '#/components/schemas/Body_upload_files_knowledge_base__kb_id__files__post'
+      responses:
+        '200':
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ResponseModel'
+        '422':
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+    get:
+      tags:
+      - knowledge base interface
+      summary: List Files
+      operationId: list_files_knowledge_base__kb_id__files__get
+      parameters:
+      - name: kb_id
+        in: path
+        required: true
+        schema:
+          type: integer
+          title: Kb Id
+      - name: pageNo
+        in: query
+        required: false
+        schema:
+          type: integer
+          default: 1
+          title: Pageno
+      - name: pageSize
+        in: query
+        required: false
+        schema:
+          type: integer
+          default: 10
+          title: Pagesize
+      - name: file_name
+        in: query
+        required: false
+        schema:
+          anyOf:
+          - type: string
+          - type: 'null'
+          title: File Name
+      responses:
+        '200':
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ResponseModel'
+        '422':
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+  /knowledge-base/{kb_id}/files/search/:
+    get:
+      tags:
+      - knowledge base interface
+      summary: Search Files
+      operationId: search_files_knowledge_base__kb_id__files_search__get
+      parameters:
+      - name: kb_id
+        in: path
+        required: true
+        schema:
+          type: integer
+          title: Kb Id
+      - name: file_name
+        in: query
+        required: true
+        schema:
+          type: string
+          title: File Name
+      responses:
+        '200':
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ResponseModel'
+        '422':
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+  /files/{file_id}/download:
+    get:
+      tags:
+      - knowledge base interface
+      summary: Download File
+      operationId: download_file_files__file_id__download_get
+      parameters:
+      - name: file_id
+        in: path
+        required: true
+        schema:
+          type: integer
+          title: File Id
+      responses:
+        '200':
+          description: Successful Response
+          content:
+            application/json:
+              schema: {}
+        '422':
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+  /files/{file_id}:
+    delete:
+      tags:
+      - knowledge base interface
+      summary: Delete File
+      operationId: delete_file_files__file_id__delete
+      parameters:
+      - name: file_id
+        in: path
+        required: true
+        schema:
+          type: integer
+          title: File Id
+      responses:
+        '200':
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                type: object
+                additionalProperties: true
+                title: Response Delete File Files  File Id  Delete
+        '422':
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+  /files/batch-update:
+    put:
+      tags:
+      - knowledge base interface
+      summary: Batch Update Files
+      operationId: batch_update_files_files_batch_update_put
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/BatchFileUpdate'
+        required: true
+      responses:
+        '200':
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ResponseModel'
+        '422':
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
 components:
   schemas:
     BasicRequest:
@@ -382,6 +745,17 @@ components:
       - name
       - value
       title: BasicRequestParameter
+    BatchFileUpdate:
+      properties:
+        files:
+          items:
+            $ref: '#/components/schemas/FileUpdate'
+          type: array
+          title: Files
+      type: object
+      required:
+      - files
+      title: BatchFileUpdate
     Body_upload_file_file_upload__file_type___job_id__post:
       properties:
         file:
@@ -392,6 +766,62 @@ components:
       required:
       - file
       title: Body_upload_file_file_upload__file_type___job_id__post
+    Body_upload_files_knowledge_base__kb_id__files__post:
+      properties:
+        files:
+          items:
+            type: string
+            format: binary
+          type: array
+          title: Files
+      type: object
+      required:
+      - files
+      title: Body_upload_files_knowledge_base__kb_id__files__post
+    FileUpdate:
+      properties:
+        id:
+          type: integer
+          title: Id
+        file_name:
+          anyOf:
+          - type: string
+          - type: 'null'
+          title: File Name
+        version:
+          anyOf:
+          - type: string
+          - type: 'null'
+          title: Version
+        author:
+          anyOf:
+          - type: string
+          - type: 'null'
+          title: Author
+        year:
+          anyOf:
+          - type: integer
+          - type: 'null'
+          title: Year
+        page_count:
+          anyOf:
+          - type: integer
+          - type: 'null'
+          title: Page Count
+        creator:
+          anyOf:
+          - type: string
+          - type: 'null'
+          title: Creator
+        knowledge_type:
+          anyOf:
+          - type: string
+          - type: 'null'
+          title: Knowledge Type
+      type: object
+      required:
+      - id
+      title: FileUpdate
     HTTPValidationError:
       properties:
         detail:
@@ -401,6 +831,67 @@ components:
           title: Detail
       type: object
       title: HTTPValidationError
+    KnowledgeBaseCreate:
+      properties:
+        name:
+          type: string
+          title: Name
+        description:
+          anyOf:
+          - type: string
+          - type: 'null'
+          title: Description
+        tags:
+          anyOf:
+          - type: string
+          - type: 'null'
+          title: Tags
+      type: object
+      required:
+      - name
+      title: KnowledgeBaseCreate
+    KnowledgeBaseUpdate:
+      properties:
+        name:
+          type: string
+          title: Name
+        description:
+          anyOf:
+          - type: string
+          - type: 'null'
+          title: Description
+        tags:
+          anyOf:
+          - type: string
+          - type: 'null'
+          title: Tags
+      type: object
+      required:
+      - name
+      title: KnowledgeBaseUpdate
+    ResponseModel:
+      properties:
+        code:
+          type: integer
+          title: Code
+        message:
+          type: string
+          title: Message
+        data:
+          anyOf:
+          - additionalProperties: true
+            type: object
+          - items: {}
+            type: array
+          - type: boolean
+          - type: 'null'
+          title: Data
+      type: object
+      required:
+      - code
+      - message
+      - data
+      title: ResponseModel
     StandardResponse:
       properties:
         code:
@@ -412,6 +903,7 @@ components:
           title: Message
           default: success
         meta:
+          additionalProperties: true
           type: object
           title: Meta
           default: {}