Chapter 13: Agent Evaluation and Optimization

This chapter explains how to evaluate an Agent's performance and how to optimize an Agent systematically.

13.1 The Evaluation Metric System

Why evaluate Agents?

Building an Agent is easy; building a "good" Agent is hard. You need to answer:

  • How well does the Agent perform?
  • Where does it need improvement?
  • After a change, did it actually get better?

Without evaluation you cannot quantify improvements or guarantee quality.

Dimensions of Agent evaluation

1. Functional metrics

Can the Agent complete tasks correctly?

  • Task completion rate: the fraction of tasks completed successfully
  • Precision: how correct the outputs are
  • Recall: whether all of the answers were found
  • F1 score: the harmonic mean of precision and recall (see the sketch below)
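
Precision, recall, and F1 can be computed directly from counted outcomes; a small sketch (the counts are illustrative):

python
def f1_score(true_positives: int, false_positives: int, false_negatives: int) -> float:
    """F1 = harmonic mean of precision and recall."""
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

print(f1_score(true_positives=80, false_positives=10, false_negatives=20))  # ≈ 0.842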

2. Performance metrics

How fast does the Agent run?

  • Response time: time from request to response
  • Throughput: tasks processed per unit of time
  • Resource consumption: CPU, memory, and token usage
  • Concurrency: how many requests it can handle at once

3. User-experience metrics

Is the Agent actually pleasant to use?

  • User satisfaction: user ratings
  • Task duration: how long users take to complete a task
  • Retry rate: the fraction of interactions users have to retry
  • Abandonment rate: the fraction of users who give up

4. Cost metrics

How much does it cost to run the Agent?

  • API cost: LLM API fees
  • Compute cost: server bills
  • Storage cost: databases and vector stores
  • Labor cost: maintenance and optimization effort

Choosing evaluation metrics

Different scenarios emphasize different metrics:

| Application scenario | Key metrics | Secondary metrics |
| --- | --- | --- |
| Customer-service bot | User satisfaction, precision | Response time |
| Data-analysis Agent | Precision, completion rate | Cost |
| Coding assistant | Precision, response time | User satisfaction |
| Batch processing | Completion rate, throughput | User experience |

Task completion rate

python
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from enum import Enum

class TaskStatus(Enum):
    SUCCESS = "success"
    PARTIAL = "partial"
    FAILED = "failed"
    TIMEOUT = "timeout"

@dataclass
class TaskResult:
    task_id: str
    status: TaskStatus
    output: Any
    duration: float
    metadata: Optional[Dict] = None

class TaskCompletionEvaluator:
    """Task-completion-rate evaluator"""
    
    def __init__(self):
        self.results: List[TaskResult] = []
    
    def record(self, result: TaskResult):
        """Record a task result"""
        self.results.append(result)
    
    def completion_rate(self) -> float:
        """Fraction of tasks at least partially completed"""
        if not self.results:
            return 0.0
        
        completed = sum(
            1 for r in self.results 
            if r.status in [TaskStatus.SUCCESS, TaskStatus.PARTIAL]
        )
        
        return completed / len(self.results)
    
    def success_rate(self) -> float:
        """Fraction of tasks fully completed"""
        if not self.results:
            return 0.0
        
        success = sum(
            1 for r in self.results 
            if r.status == TaskStatus.SUCCESS
        )
        
        return success / len(self.results)
    
    def failure_analysis(self) -> Dict[str, int]:
        """Count non-successful results by status"""
        analysis = {}
        
        for result in self.results:
            if result.status != TaskStatus.SUCCESS:
                status = result.status.value
                analysis[status] = analysis.get(status, 0) + 1
        
        return analysis
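
A quick usage sketch (the task data is made up for illustration):

python
evaluator = TaskCompletionEvaluator()
evaluator.record(TaskResult("t1", TaskStatus.SUCCESS, output="42", duration=1.2))
evaluator.record(TaskResult("t2", TaskStatus.TIMEOUT, output=None, duration=30.0))

print(f"completion: {evaluator.completion_rate():.0%}")  # 50%
print(f"success:    {evaluator.success_rate():.0%}")     # 50%
print(evaluator.failure_analysis())                      # {'timeout': 1}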

Response quality

python
from typing import Dict, Optional
from openai import OpenAI

class ResponseQualityEvaluator:
    """Response-quality evaluator (LLM-as-judge)"""
    
    def __init__(self, llm_client: OpenAI):
        self.llm = llm_client
    
    def evaluate_relevance(
        self,
        query: str,
        response: str
    ) -> float:
        """Score how relevant the response is to the query"""
        prompt = f"""
Rate how relevant the following answer is to the question (0-10):

Question: {query}
Answer: {response}

Output only the numeric score.
"""
        
        completion = self.llm.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        
        try:
            return float(completion.choices[0].message.content.strip())
        except ValueError:
            # Fall back to a neutral score if the judge didn't return a number
            return 5.0
    
    def evaluate_accuracy(
        self,
        query: str,
        response: str,
        ground_truth: str
    ) -> float:
        """Score the response against a reference answer"""
        prompt = f"""
Rate how accurate the following answer is (0-10):

Question: {query}
Answer: {response}
Reference answer: {ground_truth}

Output only the numeric score.
"""
        
        completion = self.llm.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        
        try:
            return float(completion.choices[0].message.content.strip())
        except ValueError:
            return 5.0
    
    def evaluate_clarity(self, response: str) -> float:
        """Score how clear the response is"""
        prompt = f"""
Rate the clarity of the following answer (0-10):

Answer: {response}

Consider:
- Is it easy to understand?
- Is the logic coherent?
- Is the wording precise?

Output only the numeric score.
"""
        
        completion = self.llm.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        
        try:
            return float(completion.choices[0].message.content.strip())
        except ValueError:
            return 5.0
    
    def overall_quality(
        self,
        query: str,
        response: str,
        ground_truth: Optional[str] = None
    ) -> Dict[str, float]:
        """Combined quality score"""
        relevance = self.evaluate_relevance(query, response)
        clarity = self.evaluate_clarity(response)
        
        metrics = {
            "relevance": relevance,
            "clarity": clarity
        }
        
        if ground_truth:
            metrics["accuracy"] = self.evaluate_accuracy(query, response, ground_truth)
        
        metrics["overall"] = sum(metrics.values()) / len(metrics)
        
        return metrics

Efficiency metrics

python
from dataclasses import dataclass
from typing import List
import time

@dataclass
class PerformanceMetrics:
    avg_response_time: float
    p50_response_time: float
    p95_response_time: float
    p99_response_time: float
    throughput: float

class PerformanceMonitor:
    """Performance monitor"""
    
    def __init__(self):
        self.response_times: List[float] = []
        self.request_count = 0
        self.start_time = time.time()
    
    def record_response(self, duration: float):
        """Record one response time"""
        self.response_times.append(duration)
        self.request_count += 1
    
    def calculate_metrics(self) -> PerformanceMetrics:
        """Compute latency percentiles and throughput"""
        if not self.response_times:
            return PerformanceMetrics(0, 0, 0, 0, 0)
        
        sorted_times = sorted(self.response_times)
        n = len(sorted_times)
        
        # Nearest-rank percentile approximation
        avg = sum(sorted_times) / n
        p50 = sorted_times[int(n * 0.5)]
        p95 = sorted_times[int(n * 0.95)]
        p99 = sorted_times[int(n * 0.99)]
        
        elapsed = time.time() - self.start_time
        throughput = self.request_count / elapsed if elapsed > 0 else 0
        
        return PerformanceMetrics(
            avg_response_time=avg,
            p50_response_time=p50,
            p95_response_time=p95,
            p99_response_time=p99,
            throughput=throughput
        )
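
A minimal usage sketch, timing each call with time.perf_counter (the agent.process call is a hypothetical stand-in for your Agent's entry point):

python
import time

monitor = PerformanceMonitor()

for query in ["q1", "q2", "q3"]:
    start = time.perf_counter()
    agent.process(query)  # hypothetical Agent call
    monitor.record_response(time.perf_counter() - start)

m = monitor.calculate_metrics()
print(f"avg={m.avg_response_time:.2f}s p95={m.p95_response_time:.2f}s "
      f"throughput={m.throughput:.1f} req/s")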

13.2 Evaluation Methods

How do you evaluate an Agent?

Metrics alone are not enough; you also need evaluation methods, and different methods suit different scenarios.

Three main evaluation methods

1. Human evaluation

Humans assess the Agent's performance:

Pros:

  • High accuracy: people understand nuanced situations
  • Flexibility: people can judge subjective quality
  • Catches details: human eyes spot fine-grained problems

Cons:

  • Expensive: requires human labor
  • Slow: does not scale to large evaluations
  • Subjective: different raters may disagree

Best for:

  • Pre-release quality checks
  • User-experience evaluation
  • Evaluating complex tasks

2. Automated evaluation

A program evaluates the Agent:

Pros:

  • Fast: scales to large evaluations
  • Cheap: no human labor required
  • Reproducible: results are consistent

Cons:

  • Inflexible: limited to quantifiable metrics
  • Blind spots: cannot judge subjective quality
  • Needs labels: reference answers must be prepared

Best for:

  • Regression tests: make sure improvements don't introduce bugs
  • Performance tests: response times and the like
  • Large-scale evaluation: running many test cases

3. AI-assisted evaluation

Use AI to evaluate AI:

Pros:

  • Balances speed and quality
  • Can judge subjective quality
  • Moderate cost

Cons:

  • The judging model may itself be biased
  • Less accurate than human raters
  • Requires carefully designed evaluation prompts

Best for:

  • Large-scale quality evaluation
  • Real-time monitoring
  • Assisting human evaluation

Evaluation workflow

1. Define the evaluation goals
2. Prepare test data
3. Run the Agent
4. Collect the results
5. Compute the metrics
6. Analyze the problems
7. Propose improvements

Human evaluation

python
from dataclasses import dataclass
from typing import List, Dict
from enum import Enum
from collections import defaultdict
import time

class Rating(Enum):
    EXCELLENT = 5
    GOOD = 4
    SATISFACTORY = 3
    POOR = 2
    VERY_POOR = 1

@dataclass
class HumanEvaluation:
    evaluator_id: str
    task_id: str
    response: str
    rating: Rating
    comments: str
    timestamp: float

class HumanEvaluator:
    """Human-evaluation manager"""
    
    def __init__(self):
        self.evaluations: List[HumanEvaluation] = []
        self.evaluators: Dict[str, str] = {}
    
    def register_evaluator(self, evaluator_id: str, name: str):
        """Register a rater"""
        self.evaluators[evaluator_id] = name
    
    def submit_evaluation(
        self,
        evaluator_id: str,
        task_id: str,
        response: str,
        rating: Rating,
        comments: str
    ):
        """Submit one evaluation"""
        evaluation = HumanEvaluation(
            evaluator_id=evaluator_id,
            task_id=task_id,
            response=response,
            rating=rating,
            comments=comments,
            timestamp=time.time()
        )
        
        self.evaluations.append(evaluation)
    
    def get_average_rating(self, task_id: str = None) -> float:
        """Average rating, optionally filtered to one task"""
        evaluations = self.evaluations
        if task_id:
            evaluations = [e for e in evaluations if e.task_id == task_id]
        
        if not evaluations:
            return 0.0
        
        return sum(e.rating.value for e in evaluations) / len(evaluations)
    
    def get_inter_rater_agreement(self) -> float:
        """Inter-rater agreement (simplified: a variance-based proxy,
        not a standard statistic such as Cohen's kappa)"""
        task_ratings = defaultdict(list)
        
        for evaluation in self.evaluations:
            task_ratings[evaluation.task_id].append(evaluation.rating.value)
        
        agreements = []
        
        for ratings in task_ratings.values():
            if len(ratings) > 1:
                avg = sum(ratings) / len(ratings)
                variance = sum((r - avg) ** 2 for r in ratings) / len(ratings)
                agreement = 1 / (1 + variance)  # 1.0 when raters agree exactly
                agreements.append(agreement)
        
        return sum(agreements) / len(agreements) if agreements else 0.0

Automated evaluation

python
from typing import List, Dict, Any, Optional
from dataclasses import dataclass

@dataclass
class TestCase:
    input: Any
    expected_output: Any
    metadata: Optional[Dict] = None

@dataclass
class TestResult:
    test_case: TestCase
    actual_output: Any
    passed: bool
    score: float
    error: Optional[str] = None

class AutoEvaluator:
    """Automated evaluator"""
    
    def __init__(self):
        self.test_cases: List[TestCase] = []
        self.results: List[TestResult] = []
    
    def add_test_case(self, test_case: TestCase):
        """Add a test case"""
        self.test_cases.append(test_case)
    
    def run_tests(self, agent) -> List[TestResult]:
        """Run all test cases against the agent"""
        self.results = []
        
        for test_case in self.test_cases:
            try:
                actual_output = agent.process(test_case.input)
                
                passed = self._compare_outputs(
                    test_case.expected_output,
                    actual_output
                )
                
                score = self._calculate_score(
                    test_case.expected_output,
                    actual_output
                )
                
                result = TestResult(
                    test_case=test_case,
                    actual_output=actual_output,
                    passed=passed,
                    score=score
                )
                
            except Exception as e:
                result = TestResult(
                    test_case=test_case,
                    actual_output=None,
                    passed=False,
                    score=0.0,
                    error=str(e)
                )
            
            self.results.append(result)
        
        return self.results
    
    def _compare_outputs(self, expected: Any, actual: Any) -> bool:
        """Loose comparison: substring match for strings, equality otherwise"""
        if isinstance(expected, str) and isinstance(actual, str):
            return expected.lower() in actual.lower()
        return expected == actual
    
    def _calculate_score(self, expected: Any, actual: Any) -> float:
        """Score: 1.0 on a match, else word-overlap ratio for strings"""
        if self._compare_outputs(expected, actual):
            return 1.0
        
        # Simple word-overlap similarity
        if isinstance(expected, str) and isinstance(actual, str):
            expected_words = set(expected.lower().split())
            actual_words = set(actual.lower().split())
            
            if not expected_words:
                return 0.0
            
            overlap = len(expected_words & actual_words)
            return overlap / len(expected_words)
        
        return 0.0
    
    def get_statistics(self) -> Dict[str, Any]:
        """Aggregate pass rate and average score"""
        if not self.results:
            return {}
        
        passed = sum(1 for r in self.results if r.passed)
        total = len(self.results)
        avg_score = sum(r.score for r in self.results) / total
        
        return {
            "total_tests": total,
            "passed_tests": passed,
            "failed_tests": total - passed,
            "pass_rate": passed / total,
            "average_score": avg_score
        }
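
A usage sketch with a stub agent (EchoAgent is hypothetical, only there to make the example runnable):

python
class EchoAgent:
    def process(self, text: str) -> str:
        return f"The capital of France is Paris. You asked: {text}"

evaluator = AutoEvaluator()
evaluator.add_test_case(TestCase(input="capital of France?", expected_output="Paris"))
evaluator.run_tests(EchoAgent())
print(evaluator.get_statistics())
# {'total_tests': 1, 'passed_tests': 1, 'failed_tests': 0,
#  'pass_rate': 1.0, 'average_score': 1.0}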

A/B testing

python
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
from enum import Enum
import random

class Variant(Enum):
    A = "A"
    B = "B"

@dataclass
class ABTestResult:
    variant: Variant
    metric: str
    value: float
    metadata: Optional[Dict] = None

class ABTestFramework:
    """A/B testing framework"""
    
    def __init__(self):
        self.assignments: Dict[str, Variant] = {}
        self.results: List[ABTestResult] = []
        self.ratio = 0.5  # probability of assigning variant A
    
    def assign_variant(self, user_id: str) -> Variant:
        """Assign a variant (sticky per user)"""
        if user_id in self.assignments:
            return self.assignments[user_id]
        
        variant = Variant.A if random.random() < self.ratio else Variant.B
        self.assignments[user_id] = variant
        
        return variant
    
    def record_result(
        self,
        user_id: str,
        metric: str,
        value: float,
        metadata: Optional[Dict] = None
    ):
        """Record one observation"""
        variant = self.assignments.get(user_id, Variant.A)
        
        result = ABTestResult(
            variant=variant,
            metric=metric,
            value=value,
            metadata=metadata
        )
        
        self.results.append(result)
    
    def analyze(self, metric: str) -> Dict[str, Any]:
        """Compare the two variants on one metric"""
        a_results = [r for r in self.results if r.variant == Variant.A and r.metric == metric]
        b_results = [r for r in self.results if r.variant == Variant.B and r.metric == metric]
        
        if not a_results or not b_results:
            return {}
        
        a_avg = sum(r.value for r in a_results) / len(a_results)
        b_avg = sum(r.value for r in b_results) / len(b_results)
        
        improvement = (b_avg - a_avg) / a_avg if a_avg > 0 else 0
        
        return {
            "metric": metric,
            "variant_a": {
                "count": len(a_results),
                "average": a_avg
            },
            "variant_b": {
                "count": len(b_results),
                "average": b_avg
            },
            "improvement": improvement,
            # Note: picks a winner from raw averages, with no significance test
            "winner": Variant.B if improvement > 0 else Variant.A
        }
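
analyze declares a winner from raw averages alone. Before acting on it you would normally check statistical significance; here is a minimal sketch using a two-sample z-test on the means (a normal approximation, reasonable when each variant has a few dozen observations or more):

python
import math
from statistics import mean, variance
from typing import List

def z_test_p_value(a_values: List[float], b_values: List[float]) -> float:
    """Two-sided p-value for the difference in means (normal approximation)."""
    se = math.sqrt(variance(a_values) / len(a_values) +
                   variance(b_values) / len(b_values))
    if se == 0:
        return 1.0  # no variation observed; no evidence of a difference
    z = (mean(b_values) - mean(a_values)) / se
    return math.erfc(abs(z) / math.sqrt(2))

# Only trust the "winner" if, say, z_test_p_value(a, b) < 0.05.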

13.3 Optimization Strategies

How do you optimize an Agent?

Once evaluation has surfaced problems, the Agent needs to be optimized. Optimization should be targeted: choose the method that matches the problem.

Main directions for Agent optimization

1. Prompt optimization

The prompt is the core of an Agent, and prompt tuning often yields the largest gains:

Techniques:

  • Make instructions explicit: tell the Agent exactly what to do
  • Add examples: guide the Agent with few-shot demonstrations (see the sketch after this list)
  • Restructure: improve how the prompt is organized
  • Remove ambiguity: avoid vague wording

Effects:

  • Higher accuracy
  • Fewer errors
  • Better output quality
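
A before/after sketch of what "explicit instructions plus a few-shot example" looks like in practice (the wording is illustrative, not from a real system):

python
VAGUE_PROMPT = "Classify the sentiment of this review: {review}"

IMPROVED_PROMPT = """You are a sentiment classifier.
Output exactly one word: positive, negative, or neutral.

Example:
Review: "Arrived late and the box was crushed."
Sentiment: negative

Review: "{review}"
Sentiment:"""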

2. Tool optimization

Tools are the source of an Agent's capabilities:

Techniques:

  • Improve tool descriptions: make each tool's purpose obvious to the Agent (example below)
  • Refine parameter design: simplify parameters and provide defaults
  • Strengthen error handling: return friendly error messages
  • Add new tools: extend what the Agent can do

Effects:

  • More accurate tool selection
  • Fewer tool-call errors
  • Broader range of applications
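
As an example, a tool definition in the OpenAI function-calling format with a description written for the model, a small parameter set, and a default value (the schema itself is illustrative):

python
search_orders_tool = {
    "type": "function",
    "function": {
        "name": "search_orders",
        "description": (
            "Look up a customer's orders. Use this whenever the user asks "
            "about order status, shipping, or refunds. Returns at most "
            "'limit' orders, newest first."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "customer_id": {"type": "string"},
                "limit": {"type": "integer", "default": 5},
            },
            "required": ["customer_id"],
        },
    },
}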

3. Architecture optimization

The Agent's overall architecture affects its performance:

Techniques:

  • Simplify the pipeline: remove unnecessary steps
  • Parallelize: run independent tasks concurrently (see the sketch after this list)
  • Cache results: avoid repeated computation
  • Lazy-load: load resources on demand

Effects:

  • Faster responses
  • Lower resource consumption
  • Better user experience
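
Caching and parallelism in particular fit in a few lines; a sketch (expensive_step and fetch_doc are stand-ins for real work):

python
import asyncio
from functools import lru_cache

@lru_cache(maxsize=1024)
def expensive_step(query: str) -> str:
    # stands in for an embedding or retrieval call; repeated queries are now free
    return query.upper()

async def fetch_doc(url: str) -> str:
    await asyncio.sleep(0.1)  # stands in for a network call
    return f"doc from {url}"

async def gather_docs(urls):
    # independent fetches run concurrently instead of one by one
    return await asyncio.gather(*(fetch_doc(u) for u in urls))

docs = asyncio.run(gather_docs(["a", "b", "c"]))  # ~0.1s total, not 0.3s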

4. Model optimization

Choose and use the right LLM:

Techniques:

  • Pick an appropriate model: balance capability against cost
  • Tune the temperature: control output randomness
  • Use streaming output: cut first-token latency (see the sketch after this list)
  • Batch requests: combine multiple requests into one

Effects:

  • Lower cost
  • Higher speed
  • Better quality
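
Streaming in particular is a one-flag change in the OpenAI SDK; a sketch:

python
from openai import OpenAI

client = OpenAI()
stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Explain A/B testing in one paragraph."}],
    stream=True,  # tokens arrive as they are generated, cutting first-token latency
)

for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)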

Optimization priorities

Prioritize by return on investment:

| Optimization direction | Priority |
| --- | --- |
| Prompt optimization | ⭐⭐⭐⭐⭐ |
| Tool optimization | ⭐⭐⭐⭐ |
| Architecture optimization | ⭐⭐⭐ |
| Model optimization | ⭐⭐⭐ |

Optimization loop

Evaluate and find problems
  ↓
Analyze the root causes
  ↓
Draft an optimization plan
  ↓
Implement the optimization
  ↓
Evaluate the effect
  ↓
Good enough? → Yes → Done
  ↓ No
Adjust the plan and optimize again

Prompt optimization

python
from typing import List, Dict, Any
from openai import OpenAI

class PromptOptimizer:
    """Prompt optimizer"""
    
    def __init__(self, llm_client: OpenAI):
        self.llm = llm_client
        self.history = []
    
    def optimize(
        self,
        current_prompt: str,
        test_cases: List[Dict[str, Any]],
        max_iterations: int = 5
    ) -> str:
        """Iteratively improve a prompt against a test set"""
        best_prompt = current_prompt
        best_score = self._evaluate_prompt(current_prompt, test_cases)
        
        for i in range(max_iterations):
            # Analyze failing cases
            failures = self._analyze_failures(
                current_prompt,
                test_cases
            )
            
            # Generate improvement suggestions
            improvements = self._generate_improvements(
                current_prompt,
                failures
            )
            
            # Apply them
            new_prompt = self._apply_improvements(
                current_prompt,
                improvements
            )
            
            # Evaluate the new prompt
            new_score = self._evaluate_prompt(new_prompt, test_cases)
            
            if new_score > best_score:
                best_prompt = new_prompt
                best_score = new_score
                current_prompt = new_prompt
            
            self.history.append({
                "iteration": i + 1,
                "prompt": current_prompt,
                "score": new_score
            })
        
        return best_prompt
    
    def _evaluate_prompt(
        self,
        prompt: str,
        test_cases: List[Dict[str, Any]]
    ) -> float:
        """Pass rate of a prompt over the test cases"""
        if not test_cases:
            return 0.0
        
        passed = 0
        
        for test_case in test_cases:
            completion = self.llm.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "user", "content": prompt.format(**test_case["input"])}
                ]
            )
            
            output = completion.choices[0].message.content
            
            if self._check_output(output, test_case["expected"]):
                passed += 1
        
        return passed / len(test_cases)
    
    def _check_output(self, output: str, expected: str) -> bool:
        """Loose containment check"""
        return expected.lower() in output.lower()
    
    def _analyze_failures(
        self,
        prompt: str,
        test_cases: List[Dict[str, Any]]
    ) -> List[Dict]:
        """Collect the cases the current prompt fails"""
        failures = []
        
        for test_case in test_cases:
            completion = self.llm.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "user", "content": prompt.format(**test_case["input"])}
                ]
            )
            
            output = completion.choices[0].message.content
            
            if not self._check_output(output, test_case["expected"]):
                failures.append({
                    "input": test_case["input"],
                    "expected": test_case["expected"],
                    "actual": output
                })
        
        return failures
    
    def _generate_improvements(
        self,
        prompt: str,
        failures: List[Dict]
    ) -> List[str]:
        """Ask the LLM for concrete improvement suggestions"""
        failure_text = "\n".join(
            f"Input: {f['input']}\nExpected: {f['expected']}\nActual: {f['actual'][:100]}..."
            for f in failures[:3]
        )
        
        analysis_prompt = f"""
Current prompt:
{prompt}

Failing cases:
{failure_text}

Analyze why these cases fail and give 3 concrete improvement suggestions.
"""
        
        completion = self.llm.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": analysis_prompt}]
        )
        
        return completion.choices[0].message.content.split('\n')
    
    def _apply_improvements(
        self,
        prompt: str,
        improvements: List[str]
    ) -> str:
        """Naive strategy: append the suggestions to the prompt"""
        return prompt + "\n\n" + "\n".join(improvements)

Tool optimization

python
from typing import List, Dict, Any
from dataclasses import dataclass

@dataclass
class ToolUsage:
    tool_name: str
    call_count: int
    success_count: int
    failure_count: int
    avg_duration: float

class ToolOptimizer:
    """Tool-usage optimizer"""
    
    def __init__(self):
        self.usage: Dict[str, ToolUsage] = {}
    
    def record_usage(
        self,
        tool_name: str,
        success: bool,
        duration: float
    ):
        """Record one tool call"""
        if tool_name not in self.usage:
            self.usage[tool_name] = ToolUsage(
                tool_name=tool_name,
                call_count=0,
                success_count=0,
                failure_count=0,
                avg_duration=0.0
            )
        
        usage = self.usage[tool_name]
        usage.call_count += 1
        
        if success:
            usage.success_count += 1
        else:
            usage.failure_count += 1
        
        # Running average of call duration
        n = usage.call_count
        usage.avg_duration = (
            (usage.avg_duration * (n - 1) + duration) / n
        )
    
    def get_optimization_suggestions(self) -> List[Dict[str, Any]]:
        """Flag tools with high failure rates or slow responses"""
        suggestions = []
        
        for tool_name, usage in self.usage.items():
            # Tools that fail often
            failure_rate = usage.failure_count / usage.call_count if usage.call_count > 0 else 0
            if failure_rate > 0.3:
                suggestions.append({
                    "tool": tool_name,
                    "issue": "high_failure_rate",
                    "value": failure_rate,
                    "suggestion": "Review the tool implementation and add error handling"
                })
            
            # Tools that are slow
            if usage.avg_duration > 5.0:
                suggestions.append({
                    "tool": tool_name,
                    "issue": "slow_response",
                    "value": usage.avg_duration,
                    "suggestion": "Consider caching results or optimizing the implementation"
                })
        
        return suggestions
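
Usage sketch: wrap each tool call with timing, record the outcome, and review the suggestions periodically (the numbers are illustrative):

python
optimizer = ToolOptimizer()
optimizer.record_usage("web_search", success=True, duration=1.4)
optimizer.record_usage("web_search", success=False, duration=7.2)
optimizer.record_usage("web_search", success=False, duration=6.9)

for s in optimizer.get_optimization_suggestions():
    print(s["tool"], s["issue"], round(s["value"], 2))
# web_search high_failure_rate 0.67
# web_search slow_response 5.17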

Architecture optimization

python
from typing import Dict, List, Any
from dataclasses import dataclass

@dataclass
class ArchitectureMetrics:
    component_name: str
    avg_response_time: float
    throughput: float
    error_rate: float
    resource_usage: Dict[str, float]

class ArchitectureOptimizer:
    """Architecture optimizer"""
    
    def __init__(self):
        self.metrics: Dict[str, List[ArchitectureMetrics]] = {}
    
    def record_metrics(self, metrics: ArchitectureMetrics):
        """Record one metrics sample for a component"""
        if metrics.component_name not in self.metrics:
            self.metrics[metrics.component_name] = []
        
        self.metrics[metrics.component_name].append(metrics)
    
    def analyze_bottlenecks(self) -> List[Dict[str, Any]]:
        """Find slow or error-prone components"""
        bottlenecks = []
        
        for component, metrics_list in self.metrics.items():
            if not metrics_list:
                continue
            
            avg_time = sum(m.avg_response_time for m in metrics_list) / len(metrics_list)
            avg_error_rate = sum(m.error_rate for m in metrics_list) / len(metrics_list)
            
            # Components with slow responses
            if avg_time > 3.0:
                bottlenecks.append({
                    "component": component,
                    "type": "slow_response",
                    "value": avg_time,
                    "suggestion": "Consider asynchronous processing or caching"
                })
            
            # Components with high error rates
            if avg_error_rate > 0.1:
                bottlenecks.append({
                    "component": component,
                    "type": "high_error_rate",
                    "value": avg_error_rate,
                    "suggestion": "Review error handling and retry logic"
                })
        
        return bottlenecks
    
    def suggest_scaling(self) -> Dict[str, Any]:
        """Suggest scaling for overloaded components"""
        suggestions = {}
        
        for component, metrics_list in self.metrics.items():
            if not metrics_list:
                continue
            
            avg_resource = {
                "cpu": sum(m.resource_usage.get("cpu", 0) for m in metrics_list) / len(metrics_list),
                "memory": sum(m.resource_usage.get("memory", 0) for m in metrics_list) / len(metrics_list)
            }
            
            if avg_resource["cpu"] > 0.8:
                suggestions[component] = {
                    "action": "scale_up",
                    "reason": "high_cpu_usage",
                    "current": avg_resource["cpu"],
                    "suggested_instances": 2
                }
        
        return suggestions

13.4 Hands-On: Building an Evaluation System

Let's build a complete Agent evaluation system.

Project structure

agent-evaluation/
├── .env
├── main.py
├── evaluators/
│   ├── auto.py
│   ├── human.py
│   └── quality.py
├── optimizer.py
└── requirements.txt

Complete code

evaluators/auto.py

python
from typing import List, Dict
from dataclasses import dataclass

@dataclass
class TestCase:
    input: str
    expected: str
    category: str

class AutoEvaluator:
    def __init__(self):
        self.test_cases: List[TestCase] = []
        self.results = []
    
    def add_test_case(self, test_case: TestCase):
        self.test_cases.append(test_case)
    
    def evaluate(self, agent) -> Dict:
        self.results = []  # reset so repeated runs don't accumulate
        for test_case in self.test_cases:
            response = agent.process(test_case.input)
            
            self.results.append({
                "input": test_case.input,
                "expected": test_case.expected,
                "actual": response,
                "passed": test_case.expected.lower() in response.lower(),
                "category": test_case.category
            })
        
        return self.get_statistics()
    
    def get_statistics(self) -> Dict:
        total = len(self.results)
        passed = sum(1 for r in self.results if r["passed"])
        
        return {
            "total": total,
            "passed": passed,
            "failed": total - passed,
            "pass_rate": passed / total if total > 0 else 0
        }

optimizer.py

python
from typing import Dict, List

from evaluators.auto import AutoEvaluator
from openai import OpenAI

class AgentOptimizer:
    def __init__(self, agent, evaluator: AutoEvaluator):
        self.agent = agent
        self.evaluator = evaluator
        self.llm = OpenAI()
        self.optimization_history = []
    
    def optimize(self, max_iterations: int = 5):
        best_score = 0
        best_config = self.agent.get_config()
        
        for i in range(max_iterations):
            # Evaluate the current configuration
            stats = self.evaluator.evaluate(self.agent)
            score = stats["pass_rate"]
            
            print(f"Iteration {i+1}: Score = {score:.2%}")
            
            if score > best_score:
                best_score = score
                best_config = self.agent.get_config()
            
            # Generate improvement suggestions
            suggestions = self._generate_suggestions(stats)
            
            # Apply them
            self._apply_suggestions(suggestions)
            
            self.optimization_history.append({
                "iteration": i + 1,
                "score": score,
                "config": self.agent.get_config()
            })
        
        # Restore the best configuration
        self.agent.set_config(best_config)
        
        return best_score
    
    def _generate_suggestions(self, stats: Dict) -> List[str]:
        prompt = f"""
Current Agent performance:
- Pass rate: {stats['pass_rate']:.2%}
- Failed cases: {stats['failed']}

Give 3 concrete improvement suggestions.
"""
        
        response = self.llm.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}]
        )
        
        return response.choices[0].message.content.split('\n')
    
    def _apply_suggestions(self, suggestions: List[str]):
        # Applying suggestions is agent-specific (e.g. rewriting the system
        # prompt); left as a hook here
        pass
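
main.py appears in the project layout but is not shown in the chapter. A minimal sketch of how the pieces could be wired together (the SimpleAgent stub and its config methods are assumptions, matching the interface AgentOptimizer expects):

python
# main.py (sketch)
from evaluators.auto import AutoEvaluator, TestCase
from optimizer import AgentOptimizer

class SimpleAgent:
    """Stub agent exposing the interface the optimizer expects."""
    def __init__(self):
        self.config = {"system_prompt": "You are a helpful assistant."}
    
    def process(self, text: str) -> str:
        return f"echo: {text}"
    
    def get_config(self):
        return dict(self.config)
    
    def set_config(self, config):
        self.config = config

if __name__ == "__main__":
    evaluator = AutoEvaluator()
    evaluator.add_test_case(TestCase(input="2+2?", expected="4", category="math"))
    
    optimizer = AgentOptimizer(SimpleAgent(), evaluator)
    best = optimizer.optimize(max_iterations=3)
    print(f"Best pass rate: {best:.2%}")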

Chapter Summary

In this chapter we covered:

  • ✅ The Agent evaluation metric system
  • ✅ Human evaluation, automated evaluation, and A/B testing
  • ✅ Prompt, tool, and architecture optimization strategies
  • ✅ Building a working evaluation system

Next Chapter

In the next chapter we will look at Agent safety and ethics.

Chapter 14: Agent Safety and Ethics →
