Merged

25 commits
95793bf
bot reference evaluator
chunyu3 Dec 18, 2025
a65cf4a
Merge branch 'main' of https://github.com/Azure/azure-sdk-tools into …
chunyu3 Dec 18, 2025
a07d99e
Merge branch 'main' of https://github.com/Azure/azure-sdk-tools into …
chunyu3 Jan 21, 2026
8b15b6d
use title and url to refer a reference
chunyu3 Jan 21, 2026
931257e
remove unused code
chunyu3 Jan 21, 2026
4883380
change url to link
chunyu3 Jan 22, 2026
4615d51
Merge branch 'main' of https://github.com/Azure/azure-sdk-tools into …
chunyu3 Jan 22, 2026
b81a72f
add knowledges
chunyu3 Jan 22, 2026
8e01809
record knowledges
chunyu3 Jan 22, 2026
019df12
enable evaluation for knowledges from ai search
chunyu3 Jan 22, 2026
375afb4
handle duplicate references
chunyu3 Jan 23, 2026
d85dc4d
handle matched reference in unexpected_references issue
chunyu3 Jan 23, 2026
25397c3
update the evaluation tests to include expected knowledge and references
chunyu3 Jan 23, 2026
8dfcadb
add suppress logic
chunyu3 Jan 23, 2026
406444e
resolve comment
chunyu3 Jan 23, 2026
3722257
show warning
chunyu3 Jan 27, 2026
259b0d7
update test cases
chunyu3 Jan 28, 2026
aae1d45
update the testcases
chunyu3 Jan 30, 2026
d86a928
refine baseline
chunyu3 Feb 1, 2026
a7096a6
refine baseline
chunyu3 Feb 1, 2026
d92b50d
Merge branch 'bot-evals' of https://github.com/chunyu3/azure-sdk-tool…
chunyu3 Feb 2, 2026
06535e4
update the reference and knowledge in evaluation tests
chunyu3 Feb 2, 2026
4480ccb
Merge branch 'bot-evals' of https://github.com/chunyu3/azure-sdk-tool…
chunyu3 Feb 2, 2026
d304d78
re-build baseline
chunyu3 Feb 2, 2026
ba37efa
define verificationResult enum
chunyu3 Feb 2, 2026
134 changes: 110 additions & 24 deletions tools/sdk-ai-bots/azure-sdk-qa-bot-evaluation/_evals_result.py
@@ -1,3 +1,4 @@
from enum import IntFlag, Enum
import json
import logging
import math
@@ -7,10 +8,45 @@
from tabulate import tabulate


class VerificationResult(Enum):
PASS = "pass"
FAIL = "fail"
PASS_WITH_WARNING = "Pass with Warning"

class EvalReturnCode(IntFlag):
SUCCESS = 0
FAIL = 1
WARNING = 2 << 1
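Since EvalReturnCode is an IntFlag, per-metric outcomes can be OR-ed into a scenario-level result and inspected with bitwise AND, which is how verify_results below aggregates them. A minimal illustration (not part of the diff):

```python
# Illustrative only: how the IntFlag values combine (mirrors the logic in verify_results below).
scenario_ret = EvalReturnCode.SUCCESS
scenario_ret |= EvalReturnCode.WARNING   # e.g. a suppressed evaluator failed
scenario_ret |= EvalReturnCode.FAIL      # e.g. another evaluator failed outright

assert scenario_ret & EvalReturnCode.FAIL      # the hard failure is detectable ...
assert scenario_ret & EvalReturnCode.WARNING   # ... and the warning is still recorded
```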

class EvalsResult:
def __init__(self, weights: dict[str, float] | None, metrics: dict[str, list[str] | None]):
def __init__(self, weights: dict[str, float] | None, metrics: dict[str, list[str] | None], suppressions: dict[str, list[str]] | None):
"""
Initialize an EvalsResult instance for managing evaluation results.

Args:
weights: A dictionary mapping metric names to their weights for calculating
overall scores. Keys should be in the format "{metric}_weight".
If None, an empty dictionary is used and metrics are weighted equally.
metrics: A dictionary mapping evaluator names to their output field names.
Each key is an evaluator name, and the value is a list of field names
that the evaluator outputs, or None if only the default field is used.
suppressions: A dictionary specifying which evaluators and test cases should be
suppressed (excluded from failure counting). Expected keys are:
- "evaluators": List of evaluator names to suppress.
- "testcases": List of test case names to suppress.
If None, defaults to empty lists for both.
"""
self._weights = weights or {}
self._metrics = metrics
# Ensure suppressions dict always has lists, never None
self._suppressions: dict[str, list[str]]
if suppressions is None:
self._suppressions = {"evaluators": [], "testcases": []}
else:
self._suppressions = {
"evaluators": suppressions.get("evaluators") or [],
"testcases": suppressions.get("testcases") or []
}
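A minimal construction sketch (not part of the diff) using the dictionary shapes described in the docstring above; the metric, evaluator, and test-case names are illustrative only:

```python
# Hypothetical values; only the dictionary shapes are taken from the docstring above.
evals_result = EvalsResult(
    weights={"groundedness_weight": 0.6, "similarity_weight": 0.4},
    metrics={"groundedness": None, "similarity": ["similarity_result"]},
    suppressions={
        "evaluators": ["reference_match"],   # failures from this evaluator become warnings
        "testcases": ["flaky-testcase-01"],  # failures from this testcase are counted as passes
    },
)
```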

def calculate_overall_score(self, row: dict[str, Any]) -> float:
"""Calculate weighted score based on various metrics."""
@@ -20,11 +56,12 @@ def calculate_overall_score(self, row: dict[str, Any]) -> float:
for metric in metrics:
metric_key = f"outputs.{metric}.{metric}"
if metric_key not in row:
return 0.0
logging.info(f"calculate_overall_score key:{metric_key} does not exits")
overall_score += 0.0
else:
score = float(row[metric_key])
if math.isnan(score):
return 0.0
overall_score += 0.0
if f"{metric}_weight" in self._weights:
overall_score += score * self._weights[f"{metric}_weight"]
else:
@@ -52,9 +89,14 @@ def record_run_result(self, result: dict[str, Any]) -> list[dict[str, Any]]:
row_result["testcase"] = row["inputs.testcase"]
row_result["expected"] = {
"answer": row["inputs.ground_truth"],
"reference_urls": row["inputs.expected_reference_urls"],
"references": row["inputs.expected_references"],
"knowledges": row["inputs.expected_knowledges"],
}
row_result["actual"] = {
"answer": row["inputs.response"],
"references": row["inputs.references"],
"knowledges": row["inputs.knowledges"],
}
row_result["actual"] = {"answer": row["inputs.response"], "reference_urls": row["inputs.reference_urls"]}
pattern = r"^outputs\.(\w+)\.(\w+)$"
for index, (key, value) in enumerate(row.items()):
match = re.match(pattern, key)
@@ -64,7 +106,10 @@ def record_run_result(self, result: dict[str, Any]) -> list[dict[str, Any]]:
logging.debug(f"Metric: {metric}, Name: {metric_name}")
if key == f"outputs.{metric}.{metric}_result":
if value == "fail":
fail_rates[metric] += 1
if row["inputs.testcase"] not in self._suppressions["testcases"]:
fail_rates[metric] += 1
else:
pass_rates[metric] += 1  # record as a pass if the testcase is suppressed
if value == "pass":
pass_rates[metric] += 1

@@ -90,7 +135,7 @@ def record_run_result(self, result: dict[str, Any]) -> list[dict[str, Any]]:
return run_result

@classmethod
def format_terminal_diff(cls, new: float, old: float, format_str: str = ".1f", reverse: bool = False) -> str:
def format_terminal_diff(cls, new: float, old: float, format_str: str = ".2f", reverse: bool = False) -> str:
"""Format difference with ANSI colors for terminal output."""

diff = new - old
@@ -139,10 +184,10 @@ def build_output_table(
base_score = base[f"{metric}"] if f"{metric}" in base else None
if base_score is not None:
values.append(
f"{metric_score:.1f}{EvalsResult.format_terminal_diff(metric_score, float(base_score))}"
f"{metric_score:.2f}{EvalsResult.format_terminal_diff(metric_score, float(base_score))}"
)
else:
values.append(f"{metric_score:.1f}")
values.append(f"{metric_score:.2f}")
fields = self._metrics[metric]
if fields:
for field in fields:
@@ -152,11 +197,11 @@ def build_output_table(
else:
metric_result = result[f"{metric}_result"] if f"{metric}_result" in result else "N/A"
values.append(f"{metric_result}")
values.append(f"{score:.1f}{EvalsResult.format_terminal_diff(score, float(base['overall_score']))}")
values.append(f"{score:.2f}{EvalsResult.format_terminal_diff(score, float(base['overall_score']))}")
else:
for metric in metrics:
metric_score = result[f"{metric}"] if f"{metric}" in result else -1
values.append(f"{metric_score:.1f}")
values.append(f"{metric_score:.2f}")
fields = self._metrics[metric]
if fields:
for field in fields:
@@ -166,7 +211,7 @@ def build_output_table(
else:
metric_result = result[f"{metric}_result"] if f"{metric}_result" in result else "N/A"
values.append(f"{metric_result}")
values.append(f"{score:.1f}")
values.append(f"{score:.2f}")

terminal_row.extend(values)
terminal_rows.append(terminal_row)
@@ -213,12 +258,23 @@ def show_results(self, all_results: dict[str, Any], with_baseline: bool = True)

self.output_table(test_results, name, baseline_results)

def verify_results(self, all_results: dict[str, Any], with_baseline: bool = True) -> bool:
ret = True
def verify_results(self, all_results: dict[str, Any], with_baseline: bool = True) -> VerificationResult:
"""
Verify evaluation results against baseline and suppression rules.

Args:
all_results: A dictionary mapping scenario names to their evaluation results (list of dicts).
with_baseline: Whether to compare results with baseline data (default: True).

Returns:
VerificationResult: PASS if all scenarios pass, FAIL if any scenario fails, PASS_WITH_WARNING if all scenarios pass but some only have warnings (suppressed failures).
"""
ret = VerificationResult.PASS
failed_scenarios = []
warning_scenarios = []
metrics = self._metrics.keys()
for name, test_results in all_results.items():
scenario_ret = True
scenario_ret = EvalReturnCode.SUCCESS

if with_baseline:
baseline_results = {}
@@ -232,53 +288,83 @@ def verify_results(self, all_results: dict[str, Any], with_baseline: bool = True
baseline_results[result["testcase"]] = result
baseline_results["average_score"] = baseline_data[-1]["average_score"]
if test_results[-1]["average_score"] < baseline_data[-1]["average_score"]:
# scenario_ret = False  (intentionally ignore a decrease in the average score)
logging.warning(f"scenario {name} average score decreased!")
warning = True

for metric in metrics:
pass_rate = test_results[-1][f"{metric}_pass_rate"] if f"{metric}_pass_rate" in test_results[-1] else 0
fail_rate = test_results[-1][f"{metric}_fail_rate"] if f"{metric}_fail_rate" in test_results[-1] else 0
# workaround: for groundedness, only count the `fail` results
metric_ret = EvalReturnCode.SUCCESS
if metric == "groundedness":
if fail_rate > 0:
scenario_ret = False
metric_ret = EvalReturnCode.FAIL
else:
if pass_rate < test_results[-1]["total_evals"]:
scenario_ret = False
metric_ret = EvalReturnCode.FAIL
if metric in self._suppressions["evaluators"] and metric_ret == EvalReturnCode.FAIL:
metric_ret = EvalReturnCode.WARNING  # fall back to a warning if the evaluator is suppressed
scenario_ret = scenario_ret | metric_ret

if not scenario_ret:
if scenario_ret & EvalReturnCode.FAIL:
failed_scenarios.append(name)
ret = False
ret = VerificationResult.FAIL # failed
elif scenario_ret & EvalReturnCode.WARNING:
warning_scenarios.append(name)
if ret != VerificationResult.FAIL: ret = VerificationResult.PASS_WITH_WARNING  # succeeded with warnings; never mask an earlier failure

if failed_scenarios:
logging.info(f"Failed Scenarios: {' '.join(failed_scenarios)}")
elif warning_scenarios:
logging.info(f"Scenarios with warnings: {' '.join(warning_scenarios)}")
else:
logging.info(f"All scenarios passed without issues.")

return ret
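A sketch of how a CI caller might consume the returned VerificationResult; the exit-code mapping and the surrounding variables (evals_result, all_results) are assumptions, not shown in this diff:

```python
import sys

# Assumed CI policy: only a hard FAIL breaks the build; warnings are surfaced in the logs above.
result = evals_result.verify_results(all_results, with_baseline=True)
sys.exit(1 if result == VerificationResult.FAIL else 0)
```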

def establish_baseline(self, all_results: dict[str, Any], is_ci: bool) -> None:
"""Establish the current results as the new baseline."""

def filter_keys_recursive(obj: Any, exclude_keys: list[str]) -> Any:
"""Recursively filter out keys from nested dictionaries."""
if isinstance(obj, dict):
return {k: filter_keys_recursive(v, exclude_keys)
for k, v in obj.items() if k not in exclude_keys}
elif isinstance(obj, list):
return [filter_keys_recursive(item, exclude_keys) for item in obj]
return obj
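For illustration, filter_keys_recursive strips the excluded keys at any nesting depth while preserving list structure (values below are made up):

```python
# Made-up data to show the filtering behaviour.
data = [{"testcase": "t1", "knowledges": ["k1"], "expected": {"knowledges": ["k1"], "answer": "a"}}]
filter_keys_recursive(data, ["knowledges"])
# -> [{"testcase": "t1", "expected": {"answer": "a"}}]
```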

exclude_keys = ["knowledge_match_exact_matches",
                "knowledge_match_unexpected_refs",
                "knowledge_match_missing_refs",
                "reference_match_exact_matches",
                "reference_match_unexpected_refs",
                "reference_match_missing_refs",
                "knowledges"]
# only ask to establish a baseline if we're not in CI
if is_ci is False:
establish_baseline = input("\nDo you want to establish this as the new baseline? (y/n): ")
if establish_baseline.lower() == "y":
for name, result in all_results.items():
# result is a list of dicts, filter each dict recursively
partial_result = filter_keys_recursive(result, exclude_keys)
baseline_name = f"{name.split('_')[0]}-test.json"
baseline_path = pathlib.Path(__file__).parent / "results" / baseline_name
with open(str(baseline_path), "w") as f:
json.dump(result, indent=4, fp=f)
json.dump(partial_result, indent=4, fp=f)

# whether or not we establish a baseline, we want to write results to a temp dir
log_path = pathlib.Path(__file__).parent / "results" / ".log"
if not log_path.exists():
log_path.mkdir(parents=True, exist_ok=True)

for name, result in all_results.items():
# result is a list of dicts, filter each dict recursively
partial_result = filter_keys_recursive(result, exclude_keys)
baseline_name = f"{name.split('_')[0]}-test.json"
output_path = log_path / baseline_name
with open(str(output_path), "w") as f:
json.dump(result, indent=4, fp=f)
json.dump(partial_result, indent=4, fp=f)


__all__ = ["EvalsResult"]
__all__ = ["EvalsResult", "VerificationResult"]
59 changes: 47 additions & 12 deletions tools/sdk-ai-bots/azure-sdk-qa-bot-evaluation/_evals_runner.py
@@ -17,8 +17,7 @@
import aiohttp
import yaml


def extract_links_from_references(references: List[Dict[str, Any]]) -> List[str]:
def extract_title_and_link_from_references(references: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Map an array of reference objects to a list of {"title", "link"} dictionaries.

@@ -31,15 +30,48 @@ def extract_links_from_references(references: List[Dict[str, Any]]) -> List[str]
if not references:
return []

links = []
refs = []
for ref in references:
title: str = ""
link: str = ""

if isinstance(ref, dict) and "title" in ref and ref["title"]:
title = ref["title"]
elif isinstance(ref, dict) and "Title" in ref and ref["Title"]: # Handle capitalized version
title = ref["Title"]

if isinstance(ref, dict) and "link" in ref and ref["link"]:
links.append(ref["link"])
link = ref["link"]
elif isinstance(ref, dict) and "Link" in ref and ref["Link"]: # Handle capitalized version
links.append(ref["Link"])

return links
link = ref["Link"]

refs.append({"title": title, "link": link})
return refs
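An illustrative call (reference values made up) showing the new return shape, a list of title/link dictionaries rather than the old list of links:

```python
# Made-up references; both lower-case and capitalized keys are handled.
extract_title_and_link_from_references([
    {"title": "TypeSpec docs", "link": "https://example.com/typespec"},
    {"Title": "Azure SDK guidelines", "Link": "https://example.com/guidelines"},
])
# -> [{"title": "TypeSpec docs", "link": "https://example.com/typespec"},
#     {"title": "Azure SDK guidelines", "link": "https://example.com/guidelines"}]
```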

def extract_title_and_link_from_context(context: str) -> List[Dict[str, Any]]:
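    """Parse a full_context JSON string into a list of {"title", "link"} dictionaries."""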
if not context:
return []

docs = []

try:
docs_obj = json.loads(context)
for doc in docs_obj:
title: str = ""
link: str = ""
if isinstance(doc, dict) and "document_title" in doc and doc["document_title"]:
title = doc["document_title"]

if isinstance(doc, dict) and "document_link" in doc and doc["document_link"]:
link = doc["document_link"]

docs.append({"title": title, "link": link})
except (json.JSONDecodeError, TypeError) as exc:
logging.warning(
"Failed to parse context JSON in extract_title_and_link_from_context: %s",
exc,
)
return docs
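For illustration (payload fabricated), the context string is expected to be a JSON array of documents carrying document_title and document_link:

```python
# Fabricated context payload.
context = '[{"document_title": "Branded TypeSpec", "document_link": "https://example.com/doc1"}]'
extract_title_and_link_from_context(context)
# -> [{"title": "Branded TypeSpec", "link": "https://example.com/doc1"}]
```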

# class EvaluatorConfig:
# """Configuration for an evaluator"""
Expand Down Expand Up @@ -127,7 +159,7 @@ def __init__(
num_to_run: int = 1,
):
self._evaluators = evaluators or {}
self._evals_result: EvalsResult = evals_result or EvalsResult(None, {})
self._evals_result: EvalsResult = evals_result or EvalsResult(None, {}, None)
self._num_to_run = num_to_run
# Initialize the shared cache lazily once
if EvalsRunner.channel_to_tenant_id_dict is None:
@@ -203,7 +235,6 @@ async def _process_file(self, input_file: str, output_file: str, scenario: str,
answer = api_response.get("answer", "")
full_context = api_response.get("full_context", "")
references = api_response.get("references", [])
reference_urls = extract_links_from_references(references)
latency = time.time() - start_time
processed_test_data = {
"query": record["query"],
@@ -212,10 +243,14 @@ async def _process_file(self, input_file: str, output_file: str, scenario: str,
"context": full_context,
"latency": latency,
"response_length": len(answer),
"expected_reference_urls": (
record["expected_reference_urls"] if "expected_reference_urls" in record else []
"expected_knowledges": (
record["expected_knowledges"] if "expected_knowledges" in record else []
),
"knowledges": extract_title_and_link_from_context(full_context),
"expected_references": (
record["expected_references"] if "expected_references" in record else []
),
"reference_urls": reference_urls,
"references": extract_title_and_link_from_references(references),
"testcase": record.get("testcase", "unknown"),
}
if processed_test_data:
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
from .evaluator.azure_bot_evaluator import AzureBotEvaluator
from .evaluator.azure_bot_reference_evaluator import AzureBotReferenceEvaluator

__all__ = ["AzureBotEvaluator"]
__all__ = [
"AzureBotEvaluator",
"AzureBotReferenceEvaluator"
]