Files
FastGPT/deploy/helm/opensandbox/examples/pool-agent-production.yaml
T
Archer 567d408158 Action and opensandbox deploy (#6572)
* action

* action

* action

* build: integrate OpenSandbox as Agent Execution Sandbox (#6490)

* Update action (#6571)

* action

* action

* action

* action

* action

* build: integrate OpenSandbox as Agent Execution Sandbox

# Conflicts:
#	deploy/args.json
#	deploy/dev/docker-compose.cn.yml
#	deploy/dev/docker-compose.yml
#	deploy/docker/cn/docker-compose.milvus.yml
#	deploy/docker/cn/docker-compose.oceanbase.yml
#	deploy/docker/cn/docker-compose.pg.yml
#	deploy/docker/cn/docker-compose.seekdb.yml
#	deploy/docker/cn/docker-compose.zilliz.yml
#	deploy/docker/global/docker-compose.milvus.yml
#	deploy/docker/global/docker-compose.oceanbase.yml
#	deploy/docker/global/docker-compose.pg.yml
#	deploy/docker/global/docker-compose.seekdb.yml
#	deploy/docker/global/docker-compose.ziliiz.yml
#	deploy/templates/docker-compose.prod.yml
#	document/public/deploy/docker/cn/docker-compose.milvus.yml
#	document/public/deploy/docker/cn/docker-compose.oceanbase.yml
#	document/public/deploy/docker/cn/docker-compose.pg.yml
#	document/public/deploy/docker/cn/docker-compose.seekdb.yml
#	document/public/deploy/docker/cn/docker-compose.zilliz.yml
#	document/public/deploy/docker/global/docker-compose.milvus.yml
#	document/public/deploy/docker/global/docker-compose.oceanbase.yml
#	document/public/deploy/docker/global/docker-compose.pg.yml
#	document/public/deploy/docker/global/docker-compose.seekdb.yml
#	document/public/deploy/docker/global/docker-compose.ziliiz.yml

* remove invalid  action

---------

Co-authored-by: Archer <545436317@qq.com>
Co-authored-by: xqvvu <whoeverimf5@gmail.com>

* action

---------

Co-authored-by: chanzany <chenzhi@sangfor.com.cn>
Co-authored-by: xqvvu <whoeverimf5@gmail.com>
2026-03-16 20:46:26 +08:00

343 lines
12 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# ==============================================================================
# OpenSandbox Agent Pool - 生产级配置
# ==============================================================================
#
# 用途:为多 Agent 并发场景提供预热的沙箱资源池(SDK 模式)
#
# 注意:此配置适用于 SDK 场景(需要 execd)
# 如果只使用 kubectl 管理 BatchSandbox,可以移除 execd 相关配置
#
# 架构说明:
# Pool 维护预热的 Pod 池(不是 Sandbox 池)
# - 每个 Pod 包含 execd(SDK 通信)+ task-executor(任务注入)
# - SDK.create() 从 Pool 分配 Pod,创建新的 BatchSandbox
# - SDK.kill() 删除 BatchSandbox,Pod 返回 Pool
#
# 使用模式:
# 1. Helm 部署时创建此 Pool(长期运行)
# 2. Agent 运行时通过 SDK 动态创建/删除 BatchSandbox
# 3. 不需要预创建 BatchSandbox
#
# 容量规划建议:
# | 并发 Agent 数 | bufferMin | bufferMax | poolMin | poolMax |
# |--------------|-----------|-----------|---------|---------|
# | 1-10 | 2 | 5 | 2 | 20 |
# | 10-50 | 10 | 20 | 10 | 100 |
# | 50-200 | 50 | 100 | 50 | 300 |
# | 200+ | 100 | 200 | 100 | 500 |
#
# 相关文档:
# - /data/home/cz/sandbox-test/pool-analysis/opensandbox_pool_usage_guide.md
# - /data/home/cz/OpenSandbox/kubernetes/helm-chart/values.yaml (查看 pools 配置)
#
# ==============================================================================
# Pool resource that keeps a set of pre-warmed Pods for agent sandboxes.
# Each Pod runs the sandbox container (with execd installed by an init
# container) plus a task-executor sidecar.
apiVersion: sandbox.opensandbox.io/v1alpha1
kind: Pool
metadata:
  name: agent-pool
  namespace: default  # keep in sync with the Helm chart's namespaceOverride
  labels:
    app: opensandbox
    component: agent-pool
  annotations:
    description: "生产级 Agent Pool,支持 SDK 动态创建 sandbox"
spec:
  template:
    metadata:
      labels:
        pool: agent-pool
        # Quoted so the label value stays a string rather than a boolean.
        sdk-compatible: "true"
    spec:
      # ========================================
      # Required: shared process namespace
      # ========================================
      # task-executor needs access to the sandbox container's process tree.
      shareProcessNamespace: true

      # ========================================
      # Init container: installs execd (only needed for SDK usage)
      # ========================================
      # Note: if BatchSandbox objects are managed purely with kubectl (no SDK),
      # this init container and the related execd settings can be removed.
      #
      # What execd does:
      #   - Provides the SDK <-> Pod communication endpoint (port 44772)
      #   - Executes commands sent by the SDK (commands.run(),
      #     files.read_file(), etc.)
      #
      # When it is needed:
      #   - Needed: sandboxes created dynamically through the SDK
      #     (Agent, Code Interpreter, etc.)
      #   - Not needed: pure kubectl batch jobs (RL training, load tests, etc.)
      initContainers:
        - name: execd-installer
          image: opensandbox/execd:v1.0.5
          imagePullPolicy: IfNotPresent
          command: ["/bin/sh", "-c"]
          # Copies the execd binary and bootstrap script into the shared
          # emptyDir volume and marks both executable.
          args:
            - |
              # 复制 execd 二进制和启动脚本
              cp ./execd /opt/opensandbox/bin/execd && \
              cp ./bootstrap.sh /opt/opensandbox/bin/bootstrap.sh && \
              chmod +x /opt/opensandbox/bin/execd && \
              chmod +x /opt/opensandbox/bin/bootstrap.sh && \
              echo "execd installed successfully"
          volumeMounts:
            - name: opensandbox-bin
              mountPath: /opt/opensandbox/bin

      # ========================================
      # Main container: sandbox environment
      # ========================================
      containers:
        - name: sandbox-container
          # Image notes:
          #   - The image is fixed by the Pool; an image passed by the SDK is
          #     ignored.
          #   - Pick a base image that fits the agent's needs
          #     (nginx, ubuntu, python, etc.).
          #   - The image must contain /bin/sh so bootstrap.sh can run.
          image: nginx:latest
          imagePullPolicy: IfNotPresent
          # Start command: bootstrap.sh launches execd. It will:
          #   1. Start execd in the background (listening on port 44772)
          #   2. Run the user-specified command (here: sleep infinity)
          command: ["/opt/opensandbox/bin/bootstrap.sh", "sleep", "infinity"]
          # Environment variables
          env:
            - name: EXECD
              value: /opt/opensandbox/bin/execd
            # Additional environment variables can be added:
            # - name: CUSTOM_VAR
            #   value: "custom-value"
          # Ports
          ports:
            - containerPort: 44772
              name: execd
              protocol: TCP
            # Other ports (e.g. an application port) can be exposed:
            # - containerPort: 8080
            #   name: app
            #   protocol: TCP
          # Resources: tune to the complexity of the agent's tasks.
          resources:
            requests:
              cpu: "100m"      # minimum CPU (guarantees scheduling)
              memory: "128Mi"  # minimum memory
            limits:
              cpu: "500m"      # maximum CPU (prevents resource hogging)
              memory: "256Mi"  # maximum memory
          # Volume mounts: execd binaries are only read here, never written.
          volumeMounts:
            - name: opensandbox-bin
              mountPath: /opt/opensandbox/bin
              readOnly: true
          # Health checks (optional)
          # livenessProbe:
          #   tcpSocket:
          #     port: execd
          #   initialDelaySeconds: 10
          #   periodSeconds: 30
          # readinessProbe:
          #   tcpSocket:
          #     port: execd
          #   initialDelaySeconds: 5
          #   periodSeconds: 10

        # ========================================
        # Sidecar: Task Executor
        # ========================================
        - name: task-executor
          # Image notes:
          #   - Use the task-executor image configured in the Helm chart.
          #   - The version should match the controller version.
          #   - Development: opensandbox/task-executor:dev
          #   - Production (recommended):
          #     your-registry/opensandbox-task-executor:v1.0.0
          image: opensandbox/task-executor:dev
          # IfNotPresent still uses a locally loaded image when one exists,
          # but unlike Never it lets the kubelet pull the image on nodes that
          # do not have it, so Pool Pods can start on any node.
          imagePullPolicy: IfNotPresent
          # Ports
          ports:
            - containerPort: 5758
              name: task-executor
              protocol: TCP
          # Resources: task-executor needs CPU for process injection.
          resources:
            requests:
              cpu: "100m"
              memory: "128Mi"
            limits:
              cpu: "500m"  # may need to be raised
              memory: "256Mi"
          # Security context
          # Required: SYS_PTRACE is used to inject processes into the sandbox
          # container.
          securityContext:
            capabilities:
              add: ["SYS_PTRACE"]
            # Additional hardening recommended for production:
            # runAsNonRoot: true
            # runAsUser: 1000
            # allowPrivilegeEscalation: false

      # ========================================
      # Volumes
      # ========================================
      volumes:
        - name: opensandbox-bin
          emptyDir: {}
        # Other volumes (e.g. config files, persistent data) can be added:
        # - name: config
        #   configMap:
        #     name: agent-config
        # - name: data
        #   persistentVolumeClaim:
        #     claimName: agent-data-pvc

      # ========================================
      # Scheduling (optional)
      # ========================================
      # Node selector
      # nodeSelector:
      #   workload-type: agent
      #   zone: production
      # Tolerations (allow scheduling onto nodes with specific taints)
      # tolerations:
      #   - key: "workload"
      #     operator: "Equal"
      #     value: "agent"
      #     effect: "NoSchedule"
      # Affinity (controls how Pods are spread)
      # affinity:
      #   podAntiAffinity:
      #     preferredDuringSchedulingIgnoredDuringExecution:
      #       - weight: 100
      #         podAffinityTerm:
      #           labelSelector:
      #             matchExpressions:
      #               - key: pool
      #                 operator: In
      #                 values:
      #                   - agent-pool
      #           topologyKey: kubernetes.io/hostname

  # ========================================
  # Pool capacity
  # ========================================
  capacitySpec:
    # bufferMin: minimum buffer - the Pool keeps at least this many Pods
    # available.
    #   - Guarantees fast response so agents do not wait
    #   - Size according to peak concurrent agents
    bufferMin: 10
    # bufferMax: maximum buffer - the Pool pre-warms at most this many Pods.
    #   - Caps pre-warming cost and avoids wasted resources
    #   - Usually 2-5x bufferMin
    bufferMax: 50
    # poolMin: minimum Pool size - kept even when nothing is in use.
    #   - Baseline capacity during quiet periods, avoids cold starts
    #   - Usually equal to or slightly larger than bufferMin
    poolMin: 10
    # poolMax: maximum Pool size - upper bound during peak load.
    #   - Limits total resource usage so the cluster is not exhausted
    #   - Size according to cluster resources and business peak
    poolMax: 200
    # Capacity planning example:
    #   Scenario: 200 concurrent agents, average session of 5 minutes each
    #   bufferMin: 50   (50 agents can be served immediately)
    #   bufferMax: 100  (pre-warm 100 to absorb short bursts)
    #   poolMin: 50     (keep 50 during quiet periods)
    #   poolMax: 300    (at most 300 during peak)
# ==============================================================================
# SDK 使用示例
# ==============================================================================
#
# Python SDK 使用此 Pool:
#
# ```python
# import asyncio
# from datetime import timedelta
# from opensandbox import Sandbox
# from opensandbox.config import ConnectionConfig
#
# async def create_agent_sandbox(agent_id: str):
#     """为 Agent 创建 sandbox"""
#     sandbox = await Sandbox.create(
#         "nginx:latest",  # 镜像会被忽略,使用 Pool 中的镜像
#         entrypoint=["/bin/sh", "-c", "sleep infinity"],  # 可自定义
#         env={"AGENT_ID": agent_id},  # 可传递环境变量
#         timeout=timedelta(hours=1),
#         connection_config=ConnectionConfig(domain="<server-ip>:8088"),
#         extensions={"poolRef": "agent-pool"}  # 指定 Pool 名称
#     )
#     return sandbox
#
# async def handle_agent_request(agent_id: str, task: str):
#     """处理单个 Agent 请求"""
#     sandbox = await create_agent_sandbox(agent_id)
#     try:
#         result = await sandbox.commands.run(task)
#         return result
#     finally:
#         await sandbox.kill()  # Pod 返回 Pool
# ```
#
# ==============================================================================
# 部署和监控
# ==============================================================================
#
# 1. 部署 Pool:
# kubectl apply -f pool-agent-production.yaml
#
# 2. 验证 Pool 状态:
# kubectl get pool agent-pool -n default
# kubectl get pool agent-pool -n default -o jsonpath='{.status}' | jq
#
# 3. 查看 Pool 的 Pod:
# kubectl get pods -l pool=agent-pool -n default
#
# 4. 监控 Pool 使用率:
# watch kubectl get pool agent-pool -n default -o jsonpath='{.status}'
# # 输出示例:
# # {
# # "total": 50, # 总 Pod 数
# # "allocated": 30, # 已分配
# # "available": 20 # 可用
# # }
#
# 5. 优化建议:
# - 如果 available 经常为 0 → 增加 bufferMax
# - 如果 available 总是接近 total → 减少 bufferMin
# - 如果 total 经常达到 poolMax → 增加 poolMax 或优化 Agent 使用
#
# ==============================================================================
# 故障排查
# ==============================================================================
#
# Pool Pod 无法启动:
# kubectl describe pod -l pool=agent-pool -n default
# kubectl logs -l pool=agent-pool -n default -c sandbox-container
# kubectl logs -l pool=agent-pool -n default -c task-executor
#
# execd 连接失败:
# kubectl exec -it <pod-name> -n default -c sandbox-container -- ps aux | grep execd
# kubectl exec -it <pod-name> -n default -c sandbox-container -- nc -zv localhost 44772
#
# task-executor 权限问题:
# kubectl get pod <pod-name> -n default -o yaml | grep -A 10 securityContext
#
# ==============================================================================