Merged

25 commits
95793bf
bot reference evaluator
chunyu3 Dec 18, 2025
a65cf4a
Merge branch 'main' of https://github.com/Azure/azure-sdk-tools into …
chunyu3 Dec 18, 2025
a07d99e
Merge branch 'main' of https://github.com/Azure/azure-sdk-tools into …
chunyu3 Jan 21, 2026
8b15b6d
use title and url to refer a reference
chunyu3 Jan 21, 2026
931257e
remove unused code
chunyu3 Jan 21, 2026
4883380
change url to link
chunyu3 Jan 22, 2026
4615d51
Merge branch 'main' of https://github.com/Azure/azure-sdk-tools into …
chunyu3 Jan 22, 2026
b81a72f
add knowledges
chunyu3 Jan 22, 2026
8e01809
record knowledges
chunyu3 Jan 22, 2026
019df12
enable evaluation for knowledges from ai search
chunyu3 Jan 22, 2026
375afb4
handle duplicate references
chunyu3 Jan 23, 2026
d85dc4d
handle matched reference in unexpected_references issue
chunyu3 Jan 23, 2026
25397c3
update the evaluation tests to include expected knowledge and references
chunyu3 Jan 23, 2026
8dfcadb
add suppress logic
chunyu3 Jan 23, 2026
406444e
resolve comment
chunyu3 Jan 23, 2026
3722257
show warning
chunyu3 Jan 27, 2026
259b0d7
update test cases
chunyu3 Jan 28, 2026
aae1d45
update the testcases
chunyu3 Jan 30, 2026
d86a928
refine baseline
chunyu3 Feb 1, 2026
a7096a6
refine baseline
chunyu3 Feb 1, 2026
d92b50d
Merge branch 'bot-evals' of https://github.com/chunyu3/azure-sdk-tool…
chunyu3 Feb 2, 2026
06535e4
update the reference and knowledge in evaluation tests
chunyu3 Feb 2, 2026
4480ccb
Merge branch 'bot-evals' of https://github.com/chunyu3/azure-sdk-tool…
chunyu3 Feb 2, 2026
d304d78
re-build baseline
chunyu3 Feb 2, 2026
ba37efa
define verificationResult enum
chunyu3 Feb 2, 2026
134 changes: 110 additions & 24 deletions tools/sdk-ai-bots/azure-sdk-qa-bot-evaluation/_evals_result.py
@@ -1,3 +1,4 @@
from enum import IntFlag, Enum
import json
import logging
import math
@@ -7,10 +8,45 @@
from tabulate import tabulate


class VerificationResult(Enum):
PASS = "pass"
FAIL = "fail"
PASS_WITH_WARNING = "Pass with Warning"

class EvalReturnCode(IntFlag):
SUCCESS = 0
FAIL = 1
WARNING = 2 << 1
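Since EvalReturnCode is an IntFlag, per-metric outcomes can be OR-ed into a scenario-level result and inspected with bitwise AND, which is how verify_results below aggregates them. A minimal illustration (not part of the diff):

```python
# Illustrative only: how the IntFlag values combine (mirrors the logic in verify_results below).
scenario_ret = EvalReturnCode.SUCCESS
scenario_ret |= EvalReturnCode.WARNING   # e.g. a suppressed evaluator failed
scenario_ret |= EvalReturnCode.FAIL      # e.g. another evaluator failed outright

assert scenario_ret & EvalReturnCode.FAIL      # the hard failure is detectable ...
assert scenario_ret & EvalReturnCode.WARNING   # ... and the warning is still recorded
```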

class EvalsResult:
def __init__(self, weights: dict[str, float] | None, metrics: dict[str, list[str] | None]):
def __init__(self, weights: dict[str, float] | None, metrics: dict[str, list[str] | None], suppressions: dict[str, list[str]] | None):
"""
Initialize an EvalsResult instance for managing evaluation results.

Args:
weights: A dictionary mapping metric names to their weights for calculating
overall scores. Keys should be in the format "{metric}_weight".
If None, an empty dictionary is used and metrics are weighted equally.
metrics: A dictionary mapping evaluator names to their output field names.
Each key is an evaluator name, and the value is a list of field names
that the evaluator outputs, or None if only the default field is used.
suppressions: A dictionary specifying which evaluators and test cases should be
suppressed (excluded from failure counting). Expected keys are:
- "evaluators": List of evaluator names to suppress.
- "testcases": List of test case names to suppress.
If None, defaults to empty lists for both.
"""
self._weights = weights or {}
self._metrics = metrics
# Ensure suppressions dict always has lists, never None
self._suppressions: dict[str, list[str]]
if suppressions is None:
self._suppressions = {"evaluators": [], "testcases": []}
else:
self._suppressions = {
"evaluators": suppressions.get("evaluators") or [],
"testcases": suppressions.get("testcases") or []
}
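A minimal construction sketch (not part of the diff) using the dictionary shapes described in the docstring above; the metric, evaluator, and test-case names are illustrative only:

```python
# Hypothetical values; only the dictionary shapes are taken from the docstring above.
evals_result = EvalsResult(
    weights={"groundedness_weight": 0.6, "similarity_weight": 0.4},
    metrics={"groundedness": None, "similarity": ["similarity_result"]},
    suppressions={
        "evaluators": ["reference_match"],   # failures from this evaluator become warnings
        "testcases": ["flaky-testcase-01"],  # failures from this testcase are counted as passes
    },
)
```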

def calculate_overall_score(self, row: dict[str, Any]) -> float:
"""Calculate weighted score based on various metrics."""
@@ -20,11 +56,12 @@ def calculate_overall_score(self, row: dict[str, Any]) -> float:
for metric in metrics:
metric_key = f"outputs.{metric}.{metric}"
if metric_key not in row:
return 0.0
logging.info(f"calculate_overall_score key:{metric_key} does not exits")
overall_score += 0.0
else:
score = float(row[metric_key])
if math.isnan(score):
return 0.0
overall_score += 0.0
if f"{metric}_weight" in self._weights:
overall_score += score * self._weights[f"{metric}_weight"]
else:
@@ -52,9 +89,14 @@ def record_run_result(self, result: dict[str, Any]) -> list[dict[str, Any]]:
row_result["testcase"] = row["inputs.testcase"]
row_result["expected"] = {
"answer": row["inputs.ground_truth"],
"reference_urls": row["inputs.expected_reference_urls"],
"references": row["inputs.expected_references"],
"knowledges": row["inputs.expected_knowledges"],
}
row_result["actual"] = {
"answer": row["inputs.response"],
"references": row["inputs.references"],
"knowledges": row["inputs.knowledges"],
}
row_result["actual"] = {"answer": row["inputs.response"], "reference_urls": row["inputs.reference_urls"]}
pattern = r"^outputs\.(\w+)\.(\w+)$"
for index, (key, value) in enumerate(row.items()):
match = re.match(pattern, key)
@@ -64,7 +106,10 @@ def record_run_result(self, result: dict[str, Any]) -> list[dict[str, Any]]:
logging.debug(f"Metric: {metric}, Name: {metric_name}")
if key == f"outputs.{metric}.{metric}_result":
if value == "fail":
fail_rates[metric] += 1
if row["inputs.testcase"] not in self._suppressions["testcases"]:
fail_rates[metric] += 1
else:
pass_rates[metric] += 1  # record as a pass if the testcase is suppressed
if value == "pass":
pass_rates[metric] += 1

@@ -90,7 +135,7 @@ def record_run_result(self, result: dict[str, Any]) -> list[dict[str, Any]]:
return run_result

@classmethod
def format_terminal_diff(cls, new: float, old: float, format_str: str = ".1f", reverse: bool = False) -> str:
def format_terminal_diff(cls, new: float, old: float, format_str: str = ".2f", reverse: bool = False) -> str:
"""Format difference with ANSI colors for terminal output."""

diff = new - old
@@ -139,10 +184,10 @@ def build_output_table(
base_score = base[f"{metric}"] if f"{metric}" in base else None
if base_score is not None:
values.append(
f"{metric_score:.1f}{EvalsResult.format_terminal_diff(metric_score, float(base_score))}"
f"{metric_score:.2f}{EvalsResult.format_terminal_diff(metric_score, float(base_score))}"
)
else:
values.append(f"{metric_score:.1f}")
values.append(f"{metric_score:.2f}")
fields = self._metrics[metric]
if fields:
for field in fields:
@@ -152,11 +197,11 @@ def build_output_table(
else:
metric_result = result[f"{metric}_result"] if f"{metric}_result" in result else "N/A"
values.append(f"{metric_result}")
values.append(f"{score:.1f}{EvalsResult.format_terminal_diff(score, float(base['overall_score']))}")
values.append(f"{score:.2f}{EvalsResult.format_terminal_diff(score, float(base['overall_score']))}")
else:
for metric in metrics:
metric_score = result[f"{metric}"] if f"{metric}" in result else -1
values.append(f"{metric_score:.1f}")
values.append(f"{metric_score:.2f}")
fields = self._metrics[metric]
if fields:
for field in fields:
@@ -166,7 +211,7 @@ def build_output_table(
else:
metric_result = result[f"{metric}_result"] if f"{metric}_result" in result else "N/A"
values.append(f"{metric_result}")
values.append(f"{score:.1f}")
values.append(f"{score:.2f}")

terminal_row.extend(values)
terminal_rows.append(terminal_row)
@@ -213,12 +258,23 @@ def show_results(self, all_results: dict[str, Any], with_baseline: bool = True)

self.output_table(test_results, name, baseline_results)

def verify_results(self, all_results: dict[str, Any], with_baseline: bool = True) -> bool:
ret = True
def verify_results(self, all_results: dict[str, Any], with_baseline: bool = True) -> VerificationResult:
"""
Verify evaluation results against baseline and suppression rules.

Args:
all_results: A dictionary mapping scenario names to their evaluation results (list of dicts).
with_baseline: Whether to compare results with baseline data (default: True).

Returns:
VerificationResult: PASS if all scenarios pass, FAIL if any scenario fails, PASS_WITH_WARNING if all scenarios pass but some only have warnings (suppressed failures).
"""
ret = VerificationResult.PASS
failed_scenarios = []
warning_scenarios = []
metrics = self._metrics.keys()
for name, test_results in all_results.items():
scenario_ret = True
scenario_ret = EvalReturnCode.SUCCESS

if with_baseline:
baseline_results = {}
@@ -232,53 +288,83 @@ def verify_results(self, all_results: dict[str, Any], with_baseline: bool = True
baseline_results[result["testcase"]] = result
baseline_results["average_score"] = baseline_data[-1]["average_score"]
if test_results[-1]["average_score"] < baseline_data[-1]["average_score"]:
# scenario_ret = False  (intentionally ignore a decrease in the average score)
logging.warning(f"scenario {name} average score decreased!")
warning = True

for metric in metrics:
pass_rate = test_results[-1][f"{metric}_pass_rate"] if f"{metric}_pass_rate" in test_results[-1] else 0
fail_rate = test_results[-1][f"{metric}_fail_rate"] if f"{metric}_fail_rate" in test_results[-1] else 0
# workaround: for groundedness, only count the `fail` results
metric_ret = EvalReturnCode.SUCCESS
if metric == "groundedness":
if fail_rate > 0:
scenario_ret = False
metric_ret = EvalReturnCode.FAIL
else:
if pass_rate < test_results[-1]["total_evals"]:
scenario_ret = False
metric_ret = EvalReturnCode.FAIL
if metric in self._suppressions["evaluators"] and metric_ret == EvalReturnCode.FAIL:
metric_ret = EvalReturnCode.WARNING  # fall back to a warning if the evaluator is suppressed
scenario_ret = scenario_ret | metric_ret

if not scenario_ret:
if scenario_ret & EvalReturnCode.FAIL:
failed_scenarios.append(name)
ret = False
ret = VerificationResult.FAIL # failed
elif scenario_ret & EvalReturnCode.WARNING:
warning_scenarios.append(name)
if ret != VerificationResult.FAIL: ret = VerificationResult.PASS_WITH_WARNING  # succeeded with warnings; never mask an earlier failure

if failed_scenarios:
logging.info(f"Failed Scenarios: {' '.join(failed_scenarios)}")
elif warning_scenarios:
logging.info(f"Scenarios with warnings: {' '.join(warning_scenarios)}")
else:
logging.info(f"All scenarios passed without issues.")

return ret
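A sketch of how a CI caller might consume the returned VerificationResult; the exit-code mapping and the surrounding variables (evals_result, all_results) are assumptions, not shown in this diff:

```python
import sys

# Assumed CI policy: only a hard FAIL breaks the build; warnings are surfaced in the logs above.
result = evals_result.verify_results(all_results, with_baseline=True)
sys.exit(1 if result == VerificationResult.FAIL else 0)
```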

def establish_baseline(self, all_results: dict[str, Any], is_ci: bool) -> None:
"""Establish the current results as the new baseline."""

def filter_keys_recursive(obj: Any, exclude_keys: list[str]) -> Any:
"""Recursively filter out keys from nested dictionaries."""
if isinstance(obj, dict):
return {k: filter_keys_recursive(v, exclude_keys)
for k, v in obj.items() if k not in exclude_keys}
elif isinstance(obj, list):
return [filter_keys_recursive(item, exclude_keys) for item in obj]
return obj
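For illustration, filter_keys_recursive strips the excluded keys at any nesting depth while preserving list structure (values below are made up):

```python
# Made-up data to show the filtering behaviour.
data = [{"testcase": "t1", "knowledges": ["k1"], "expected": {"knowledges": ["k1"], "answer": "a"}}]
filter_keys_recursive(data, ["knowledges"])
# -> [{"testcase": "t1", "expected": {"answer": "a"}}]
```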

exclude_keys = ["knowledge_match_exact_matches",
                "knowledge_match_unexpected_refs",
                "knowledge_match_missing_refs",
                "reference_match_exact_matches",
                "reference_match_unexpected_refs",
                "reference_match_missing_refs",
                "knowledges"]
# only ask to establish a baseline if we're not in CI
if is_ci is False:
establish_baseline = input("\nDo you want to establish this as the new baseline? (y/n): ")
if establish_baseline.lower() == "y":
for name, result in all_results.items():
# result is a list of dicts, filter each dict recursively
partial_result = filter_keys_recursive(result, exclude_keys)
baseline_name = f"{name.split('_')[0]}-test.json"
baseline_path = pathlib.Path(__file__).parent / "results" / baseline_name
with open(str(baseline_path), "w") as f:
json.dump(result, indent=4, fp=f)
json.dump(partial_result, indent=4, fp=f)

# whether or not we establish a baseline, we want to write results to a temp dir
log_path = pathlib.Path(__file__).parent / "results" / ".log"
if not log_path.exists():
log_path.mkdir(parents=True, exist_ok=True)

for name, result in all_results.items():
# result is a list of dicts, filter each dict recursively
partial_result = filter_keys_recursive(result, exclude_keys)
baseline_name = f"{name.split('_')[0]}-test.json"
output_path = log_path / baseline_name
with open(str(output_path), "w") as f:
json.dump(result, indent=4, fp=f)
json.dump(partial_result, indent=4, fp=f)


__all__ = ["EvalsResult"]
__all__ = ["EvalsResult", "VerificationResult"]
59 changes: 47 additions & 12 deletions tools/sdk-ai-bots/azure-sdk-qa-bot-evaluation/_evals_runner.py
@@ -17,8 +17,7 @@
import aiohttp
import yaml


def extract_links_from_references(references: List[Dict[str, Any]]) -> List[str]:
def extract_title_and_link_from_references(references: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Map an array of reference objects to a list of {"title", "link"} dictionaries.

@@ -31,15 +30,48 @@ def extract_links_from_references(references: List[Dict[str, Any]]) -> List[str]
if not references:
return []

links = []
refs = []
for ref in references:
title: str = ""
link: str = ""

if isinstance(ref, dict) and "title" in ref and ref["title"]:
title = ref["title"]
elif isinstance(ref, dict) and "Title" in ref and ref["Title"]: # Handle capitalized version
title = ref["Title"]

if isinstance(ref, dict) and "link" in ref and ref["link"]:
links.append(ref["link"])
link = ref["link"]
elif isinstance(ref, dict) and "Link" in ref and ref["Link"]: # Handle capitalized version
links.append(ref["Link"])

return links
link = ref["Link"]

refs.append({"title": title, "link": link})
return refs
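An illustrative call (reference values made up) showing the new return shape, a list of title/link dictionaries rather than the old list of links:

```python
# Made-up references; both lower-case and capitalized keys are handled.
extract_title_and_link_from_references([
    {"title": "TypeSpec docs", "link": "https://example.com/typespec"},
    {"Title": "Azure SDK guidelines", "Link": "https://example.com/guidelines"},
])
# -> [{"title": "TypeSpec docs", "link": "https://example.com/typespec"},
#     {"title": "Azure SDK guidelines", "link": "https://example.com/guidelines"}]
```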

def extract_title_and_link_from_context(context: str) -> List[Dict[str, Any]]:
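    """Parse a full_context JSON string into a list of {"title", "link"} dictionaries."""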
if not context:
return []

docs = []

try:
docs_obj = json.loads(context)
for doc in docs_obj:
title: str = ""
link: str = ""
if isinstance(doc, dict) and "document_title" in doc and doc["document_title"]:
title = doc["document_title"]

if isinstance(doc, dict) and "document_link" in doc and doc["document_link"]:
link = doc["document_link"]

docs.append({"title": title, "link": link})
except (json.JSONDecodeError, TypeError) as exc:
logging.warning(
"Failed to parse context JSON in extract_title_and_link_from_context: %s",
exc,
)
return docs
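For illustration (payload fabricated), the context string is expected to be a JSON array of documents carrying document_title and document_link:

```python
# Fabricated context payload.
context = '[{"document_title": "Branded TypeSpec", "document_link": "https://example.com/doc1"}]'
extract_title_and_link_from_context(context)
# -> [{"title": "Branded TypeSpec", "link": "https://example.com/doc1"}]
```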

# class EvaluatorConfig:
# """Configuration for an evaluator"""
Expand Down Expand Up @@ -127,7 +159,7 @@ def __init__(
num_to_run: int = 1,
):
self._evaluators = evaluators or {}
self._evals_result: EvalsResult = evals_result or EvalsResult(None, {})
self._evals_result: EvalsResult = evals_result or EvalsResult(None, {}, None)
self._num_to_run = num_to_run
# Initialize the shared cache lazily once
if EvalsRunner.channel_to_tenant_id_dict is None:
@@ -203,7 +235,6 @@ async def _process_file(self, input_file: str, output_file: str, scenario: str,
answer = api_response.get("answer", "")
full_context = api_response.get("full_context", "")
references = api_response.get("references", [])
reference_urls = extract_links_from_references(references)
latency = time.time() - start_time
processed_test_data = {
"query": record["query"],
@@ -212,10 +243,14 @@ async def _process_file(self, input_file: str, output_file: str, scenario: str,
"context": full_context,
"latency": latency,
"response_length": len(answer),
"expected_reference_urls": (
record["expected_reference_urls"] if "expected_reference_urls" in record else []
"expected_knowledges": (
record["expected_knowledges"] if "expected_knowledges" in record else []
),
"knowledges": extract_title_and_link_from_context(full_context),
"expected_references": (
record["expected_references"] if "expected_references" in record else []
),
"reference_urls": reference_urls,
"references": extract_title_and_link_from_references(references),
"testcase": record.get("testcase", "unknown"),
}
if processed_test_data:
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
from .evaluator.azure_bot_evaluator import AzureBotEvaluator
from .evaluator.azure_bot_reference_evaluator import AzureBotReferenceEvaluator

__all__ = ["AzureBotEvaluator"]
__all__ = [
"AzureBotEvaluator",
"AzureBotReferenceEvaluator"
]