feat(feedback): track failure reasons for model learning

Test · Test · commit 98aa36c4143b · 2026-01-07T21:33:55.000-03:00
- Extended recordFeedback to include failureReason parameter
- Failure events now carry the reason in metadata under the failure_reason key; LoopDetector stats are appended when a loop detector is configured
- This allows the model selector to learn from specific failure patterns
- Removed the misleading cost estimate from the observer summary: it assumed a flat $0.001 per 1K tokens for every model (Groq is free/cheap)

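Format of failure_reason (the parenthesized part is the LoopDetector.GetStats() output and is omitted when no loop detector is set):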
  max_iterations_reached (Iterations: 25, Files Modified: 0, Files Read: 5, Tool Calls: 12)

This data can be used to:
- Identify models that struggle with certain task types
- Route complex tasks to more capable models
- Track iteration patterns that lead to failure
diff --git a/internal/maestro/conductor.go b/internal/maestro/conductor.go
@@ -221,7 +221,7 @@ func (c *Conductor) ExecuteTask(ctx context.Context, task string, complexity str
 
 		// Check if this is a query-only task (no validation needed)
 		if c.isQueryTask(plan, modifiedFiles) {
-			c.recordFeedback(editBackend, editModel, "editor", task, true)
+			c.recordFeedback(editBackend, editModel, "editor", task, true, "")
 
 			fmt.Printf("\n[OK] Task complete!\n")
 			if result != "" {
@@ -351,8 +351,8 @@ func (c *Conductor) ExecuteTask(ctx context.Context, task string, complexity str
 		}
 
 		// Success! Record positive feedback
-		c.recordFeedback(editBackend, editModel, "editor", task, true)
-		c.recordFeedback(reviewBackend, reviewModel, "reviewer", task, true)
+		c.recordFeedback(editBackend, editModel, "editor", task, true, "")
+		c.recordFeedback(reviewBackend, reviewModel, "reviewer", task, true, "")
 
 		fmt.Printf("\n[OK] Task complete!\n")
 		if result != "" {
@@ -374,8 +374,13 @@ func (c *Conductor) ExecuteTask(ctx context.Context, task string, complexity str
 	// Task failed after all attempts - record negative feedback
 	editBackend, editModel, _ := c.selector.SelectModel(config.ActionEdit, c.language, complexity)
 	reviewBackend, reviewModel, _ := c.selector.SelectModel(config.ActionReview, c.language, complexity)
-	c.recordFeedback(editBackend, editModel, "editor", task, false)
-	c.recordFeedback(reviewBackend, reviewModel, "reviewer", task, false)
+	// Capture failure reason from loop detector for learning
+	failureReason := "max_iterations_reached"
+	if c.loopDetector != nil {
+		failureReason = fmt.Sprintf("max_iterations_reached (%s)", c.loopDetector.GetStats())
+	}
+	c.recordFeedback(editBackend, editModel, "editor", task, false, failureReason)
+	c.recordFeedback(reviewBackend, reviewModel, "reviewer", task, false, failureReason)
 
 	// Record final failure metrics
 	if c.Tracer != nil {
@@ -392,7 +397,7 @@ func errorMsg(err error) string {
 	return err.Error()
 }
 
-func (c *Conductor) recordFeedback(backend, model, agent, task string, success bool) {
+func (c *Conductor) recordFeedback(backend, model, agent, task string, success bool, failureReason string) {
 	sentiment := feedback.SentimentBad
 	if success {
 		sentiment = feedback.SentimentGood
@@ -407,6 +412,13 @@ func (c *Conductor) recordFeedback(backend, model, agent, task string, success b
 		Context:   fmt.Sprintf("language=%s", c.language),
 	}
 
+	// Add failure reason to metadata so we can learn from specific failure types
+	if !success && failureReason != "" {
+		event.Metadata = map[string]string{
+			"failure_reason": failureReason,
+		}
+	}
+
 	if err := feedback.Record(event); err != nil {
 		if os.Getenv("GPTCODE_DEBUG") == "1" {
 			fmt.Fprintf(os.Stderr, "[WARN] Failed to record feedback: %v\n", err)
diff --git a/internal/observability/observer.go b/internal/observability/observer.go
@@ -452,13 +452,6 @@ func (o *AgentObserver) PrintSummary() {
 		fmt.Printf("  Tokens In:          %s\n", formatNumber(summary.TokensIn))
 		fmt.Printf("  Tokens Out:         %s\n", formatNumber(summary.TokensOut))
 		fmt.Printf("  Total Tokens:       %s\n", formatNumber(summary.TokensIn+summary.TokensOut))
-		
-		// Estimate cost (rough approximation)
-		if summary.TokensIn+summary.TokensOut > 0 {
-			// Assuming average cost of $0.001 per 1K tokens (varies by model)
-			estimatedCost := float64(summary.TokensIn+summary.TokensOut) / 1000.0 * 0.001
-			fmt.Printf("  Est. Cost:          $%.4f\n", estimatedCost)
-		}
 	} else {
 		fmt.Println("  No LLM calls recorded")
 	}