mirror of
https://github.com/labring/FastGPT.git
synced 2026-05-02 01:02:05 +08:00
567d408158
* action * action * action * build: integrate OpenSandbox as Agent Execution Sandbox (#6490) * Update action (#6571) * action * action * action * action * action * build: integrate OpenSandbox as Agent Execution Sandbox # Conflicts: # deploy/args.json # deploy/dev/docker-compose.cn.yml # deploy/dev/docker-compose.yml # deploy/docker/cn/docker-compose.milvus.yml # deploy/docker/cn/docker-compose.oceanbase.yml # deploy/docker/cn/docker-compose.pg.yml # deploy/docker/cn/docker-compose.seekdb.yml # deploy/docker/cn/docker-compose.zilliz.yml # deploy/docker/global/docker-compose.milvus.yml # deploy/docker/global/docker-compose.oceanbase.yml # deploy/docker/global/docker-compose.pg.yml # deploy/docker/global/docker-compose.seekdb.yml # deploy/docker/global/docker-compose.ziliiz.yml # deploy/templates/docker-compose.prod.yml # document/public/deploy/docker/cn/docker-compose.milvus.yml # document/public/deploy/docker/cn/docker-compose.oceanbase.yml # document/public/deploy/docker/cn/docker-compose.pg.yml # document/public/deploy/docker/cn/docker-compose.seekdb.yml # document/public/deploy/docker/cn/docker-compose.zilliz.yml # document/public/deploy/docker/global/docker-compose.milvus.yml # document/public/deploy/docker/global/docker-compose.oceanbase.yml # document/public/deploy/docker/global/docker-compose.pg.yml # document/public/deploy/docker/global/docker-compose.seekdb.yml # document/public/deploy/docker/global/docker-compose.ziliiz.yml * remove invalid action --------- Co-authored-by: Archer <545436317@qq.com> Co-authored-by: xqvvu <whoeverimf5@gmail.com> * action --------- Co-authored-by: chanzany <chenzhi@sangfor.com.cn> Co-authored-by: xqvvu <whoeverimf5@gmail.com>
343 lines
12 KiB
YAML
343 lines
12 KiB
YAML
# ==============================================================================
# OpenSandbox Agent Pool - 生产级配置
# ==============================================================================
#
# 用途:为多 Agent 并发场景提供预热的沙箱资源池(SDK 模式)
#
# 注意:此配置适用于 SDK 场景(需要 execd)
#       如果只使用 kubectl 管理 BatchSandbox,可以移除 execd 相关配置
#
# 架构说明:
#   Pool 维护预热的 Pod 池(不是 Sandbox 池)
#   - 每个 Pod 包含 execd(SDK 通信)+ task-executor(任务注入)
#   - SDK.create() 从 Pool 分配 Pod,创建新的 BatchSandbox
#   - SDK.kill() 删除 BatchSandbox,Pod 返回 Pool
#
# 使用模式:
#   1. Helm 部署时创建此 Pool(长期运行)
#   2. Agent 运行时通过 SDK 动态创建/删除 BatchSandbox
#   3. 不需要预创建 BatchSandbox
#
# 容量规划建议:
#   | 并发 Agent 数 | bufferMin | bufferMax | poolMin | poolMax |
#   |--------------|-----------|-----------|---------|---------|
#   | 1-10         | 2         | 5         | 2       | 20      |
#   | 10-50        | 10        | 20        | 10      | 100     |
#   | 50-200       | 50        | 100       | 50      | 300     |
#   | 200+         | 100       | 200       | 100     | 500     |
#
# 相关文档:
#   - /data/home/cz/sandbox-test/pool-analysis/opensandbox_pool_usage_guide.md
#   - /data/home/cz/OpenSandbox/kubernetes/helm-chart/values.yaml (查看 pools 配置)
#
# ==============================================================================
|
||
|
||
# Pool resource: keeps a set of pre-warmed Pods that the OpenSandbox SDK
# allocates to BatchSandboxes on demand.
apiVersion: sandbox.opensandbox.io/v1alpha1
kind: Pool
metadata:
  name: agent-pool
  namespace: default  # Use the same namespace as the Helm chart's namespaceOverride
  labels:
    app: opensandbox
    component: agent-pool
  annotations:
    description: "生产级 Agent Pool,支持 SDK 动态创建 sandbox"
spec:
  template:
    metadata:
      labels:
        pool: agent-pool
        sdk-compatible: "true"
    spec:
      # ========================================
      # Required: shared process namespace
      # ========================================
      # task-executor needs access to the sandbox container's process tree.
      shareProcessNamespace: true

      # ========================================
      # Init container: installs execd (SDK scenarios only)
      # ========================================
      # Note: if BatchSandboxes are managed with kubectl only (no SDK), this
      #       init container and the related execd settings can be removed.
      #
      # What execd does:
      #   - Provides the SDK <-> Pod communication endpoint (port 44772)
      #   - Executes commands sent by the SDK (commands.run(),
      #     files.read_file(), ...)
      #
      # When it is needed:
      #   - Needed: sandboxes created dynamically through the SDK
      #     (Agent, Code Interpreter, ...)
      #   - Not needed: pure kubectl batch jobs (RL training, load tests, ...)
      initContainers:
        - name: execd-installer
          image: opensandbox/execd:v1.0.5
          imagePullPolicy: IfNotPresent
          command: ["/bin/sh", "-c"]
          # The script copies the execd binary and bootstrap.sh into the shared
          # emptyDir volume and marks both executable. The source paths are
          # relative, so they resolve against the image's working directory.
          args:
            - |
              # 复制 execd 二进制和启动脚本
              cp ./execd /opt/opensandbox/bin/execd && \
              cp ./bootstrap.sh /opt/opensandbox/bin/bootstrap.sh && \
              chmod +x /opt/opensandbox/bin/execd && \
              chmod +x /opt/opensandbox/bin/bootstrap.sh && \
              echo "execd installed successfully"
          volumeMounts:
            - name: opensandbox-bin
              mountPath: /opt/opensandbox/bin

      # ========================================
      # Main container: the sandbox environment
      # ========================================
      containers:
        - name: sandbox-container
          # Image notes:
          #   - The image is predefined by the Pool; an image requested through
          #     the SDK is ignored
          #   - Pick a base image that fits the Agent's needs
          #     (nginx, ubuntu, python, etc.)
          #   - The image must contain /bin/sh so bootstrap.sh can run
          # NOTE(review): `latest` is a mutable tag; combined with IfNotPresent,
          # different nodes may end up running different image versions.
          # Consider pinning a specific tag or digest for production.
          image: nginx:latest
          imagePullPolicy: IfNotPresent

          # Start command: launch execd through bootstrap.sh.
          # bootstrap.sh will:
          #   1. Start execd in the background (listening on port 44772)
          #   2. Run the user-specified command (here: sleep infinity)
          command: ["/opt/opensandbox/bin/bootstrap.sh", "sleep", "infinity"]

          # Environment variables
          env:
            - name: EXECD
              value: /opt/opensandbox/bin/execd
            # Additional environment variables can be added:
            # - name: CUSTOM_VAR
            #   value: "custom-value"

          # Ports
          ports:
            - containerPort: 44772
              name: execd
              protocol: TCP
            # Other ports (e.g. an application port) can be exposed:
            # - containerPort: 8080
            #   name: app
            #   protocol: TCP

          # Resources: tune to the complexity of the Agent's tasks
          resources:
            requests:
              cpu: "100m"      # Minimum CPU (guarantees scheduling)
              memory: "128Mi"  # Minimum memory
            limits:
              cpu: "500m"      # Maximum CPU (prevents resource hogging)
              memory: "256Mi"  # Maximum memory

          # Volume mounts: execd and bootstrap.sh, written by the init container
          volumeMounts:
            - name: opensandbox-bin
              mountPath: /opt/opensandbox/bin
              readOnly: true

          # Health checks (optional)
          # livenessProbe:
          #   tcpSocket:
          #     port: execd
          #   initialDelaySeconds: 10
          #   periodSeconds: 30
          # readinessProbe:
          #   tcpSocket:
          #     port: execd
          #   initialDelaySeconds: 5
          #   periodSeconds: 10

        # ========================================
        # Sidecar: Task Executor
        # ========================================
        - name: task-executor
          # Image notes:
          #   - Use the task-executor image configured in the Helm chart
          #   - Keep the version in sync with the controller version
          #   - Development: opensandbox/task-executor:dev
          #   - Production (recommended):
          #     your-registry/opensandbox-task-executor:v1.0.0
          # NOTE(review): the tag below is still the development tag; replace
          # it with your released image before a production rollout.
          image: opensandbox/task-executor:dev
          # IfNotPresent reuses an image already on the node and pulls it
          # otherwise. The previous value `Never` made Pods fail with
          # ErrImageNeverPull on every node without a pre-loaded image.
          imagePullPolicy: IfNotPresent

          # Ports
          ports:
            - containerPort: 5758
              name: task-executor
              protocol: TCP

          # Resources: task-executor needs CPU headroom for process injection
          resources:
            requests:
              cpu: "100m"
              memory: "128Mi"
            limits:
              cpu: "500m"  # May need to be raised
              memory: "256Mi"

          # Security context
          # Required: SYS_PTRACE is used to inject processes into the sandbox
          # container.
          securityContext:
            capabilities:
              add: ["SYS_PTRACE"]
            # Additional hardening suggested for production:
            # runAsNonRoot: true
            # runAsUser: 1000
            # allowPrivilegeEscalation: false

      # ========================================
      # Volumes
      # ========================================
      volumes:
        # Scratch volume shared by the init container (writer) and the
        # sandbox container (read-only consumer).
        - name: opensandbox-bin
          emptyDir: {}
        # Other volumes (config files, persistent data) can be added:
        # - name: config
        #   configMap:
        #     name: agent-config
        # - name: data
        #   persistentVolumeClaim:
        #     claimName: agent-data-pvc

      # ========================================
      # Scheduling (optional)
      # ========================================
      # Node selector
      # nodeSelector:
      #   workload-type: agent
      #   zone: production

      # Tolerations (allow scheduling onto nodes with specific taints)
      # tolerations:
      #   - key: "workload"
      #     operator: "Equal"
      #     value: "agent"
      #     effect: "NoSchedule"

      # Affinity (controls how Pods are spread)
      # affinity:
      #   podAntiAffinity:
      #     preferredDuringSchedulingIgnoredDuringExecution:
      #       - weight: 100
      #         podAffinityTerm:
      #           labelSelector:
      #             matchExpressions:
      #               - key: pool
      #                 operator: In
      #                 values:
      #                   - agent-pool
      #           topologyKey: kubernetes.io/hostname

  # ========================================
  # Pool capacity
  # ========================================
  capacitySpec:
    # bufferMin: minimum buffer - the Pool keeps at least this many Pods
    # available
    #   - Guarantees fast response so Agents do not wait
    #   - Size it from the peak number of concurrent Agents
    bufferMin: 10

    # bufferMax: maximum buffer - the Pool pre-warms at most this many Pods
    #   - Caps the pre-warming cost and avoids wasted resources
    #   - Usually 2-5x bufferMin
    bufferMax: 50

    # poolMin: minimum pool size - kept even when nothing is in use
    #   - Baseline capacity for off-peak periods, avoids cold starts
    #   - Usually equal to or slightly above bufferMin
    poolMin: 10

    # poolMax: maximum pool size - upper bound on Pods at peak
    #   - Limits total resource usage so the cluster is not exhausted
    #   - Size it from cluster capacity and peak demand
    poolMax: 200

    # Capacity planning example:
    #   Scenario: 200 concurrent Agents, average session of 5 minutes
    #   bufferMin: 50   (50 Agents can start immediately)
    #   bufferMax: 100  (pre-warm 100 to absorb short bursts)
    #   poolMin: 50     (keep 50 during off-peak)
    #   poolMax: 300    (at most 300 at peak)
|
||
|
||
# ==============================================================================
# SDK 使用示例
# ==============================================================================
#
# Python SDK 使用此 Pool:
#
# ```python
# import asyncio
# from datetime import timedelta
# from opensandbox import Sandbox
# from opensandbox.config import ConnectionConfig
#
# async def create_agent_sandbox(agent_id: str):
#     """为 Agent 创建 sandbox"""
#     sandbox = await Sandbox.create(
#         "nginx:latest",  # 镜像会被忽略,使用 Pool 中的镜像
#         entrypoint=["/bin/sh", "-c", "sleep infinity"],  # 可自定义
#         env={"AGENT_ID": agent_id},  # 可传递环境变量
#         timeout=timedelta(hours=1),
#         connection_config=ConnectionConfig(domain="<server-ip>:8088"),
#         extensions={"poolRef": "agent-pool"}  # 指定 Pool 名称
#     )
#     return sandbox
#
# async def handle_agent_request(agent_id: str, task: str):
#     """处理单个 Agent 请求"""
#     sandbox = await create_agent_sandbox(agent_id)
#     try:
#         result = await sandbox.commands.run(task)
#         return result
#     finally:
#         await sandbox.kill()  # Pod 返回 Pool
# ```
#
|
||
# ==============================================================================
# 部署和监控
# ==============================================================================
#
# 1. 部署 Pool:
#    kubectl apply -f pool-agent-production.yaml
#
# 2. 验证 Pool 状态:
#    kubectl get pool agent-pool -n default
#    kubectl get pool agent-pool -n default -o jsonpath='{.status}' | jq
#
# 3. 查看 Pool 的 Pod:
#    kubectl get pods -l pool=agent-pool -n default
#
# 4. 监控 Pool 使用率:
#    watch kubectl get pool agent-pool -n default -o jsonpath='{.status}'
#    # 输出示例:
#    # {
#    #   "total": 50,      # 总 Pod 数
#    #   "allocated": 30,  # 已分配
#    #   "available": 20   # 可用
#    # }
#
# 5. 优化建议:
#    - 如果 available 经常为 0 → 增加 bufferMax
#    - 如果 available 总是接近 total → 减少 bufferMin
#    - 如果 total 经常达到 poolMax → 增加 poolMax 或优化 Agent 使用
#
|
||
# ==============================================================================
# 故障排查
# ==============================================================================
#
# Pool Pod 无法启动:
#   kubectl describe pod -l pool=agent-pool -n default
#   kubectl logs -l pool=agent-pool -n default -c sandbox-container
#   kubectl logs -l pool=agent-pool -n default -c task-executor
#
# execd 连接失败:
#   kubectl exec -it <pod-name> -n default -c sandbox-container -- ps aux | grep execd
#   kubectl exec -it <pod-name> -n default -c sandbox-container -- nc -zv localhost 44772
#
# task-executor 权限问题:
#   kubectl get pod <pod-name> -n default -o yaml | grep -A 10 securityContext
#
# ==============================================================================
|