-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinput_guardrails.py
More file actions
52 lines (43 loc) · 1.51 KB
/
input_guardrails.py
File metadata and controls
52 lines (43 loc) · 1.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from pydantic import BaseModel
from agents import (
Agent,
GuardrailFunctionOutput,
RunContextWrapper,
Runner,
TResponseInputItem,
input_guardrail,
RunConfig,
)
from my_secrets import Secrets

# Module-level secrets/config holder; the only attribute read in this file is
# `gemini_api_model` (the model identifier passed to the Agent below).
# NOTE(review): Secrets is project-local — confirm it loads credentials safely
# (env vars / vault) rather than hard-coding them.
secrets = Secrets()
class MaliciousIntentOutput(BaseModel):
    """Structured verdict produced by the malicious-intent classifier agent."""

    # True when the classifier judges the input to carry actual malicious intent.
    has_malicious_intent: bool
    # True when the input reads as a legitimate developer / security-education query.
    is_developer_context: bool
    # Free-text explanation supporting the judgment (for logging/audit).
    reasoning: str
malicious_intent_agent = Agent(
name="Malicious Intent Guardrail",
instructions="""
Evaluate whether the user input has *actual malicious intent* or is instead part of a legitimate, educational, or professional inquiry.
Examples of legitimate input (DO NOT flag):
- "How does XSS work?" (developer education)
- "What are common vulnerabilities in JWTs?" (security learning)
- "Can I report a vulnerability?" (responsible disclosure)
Examples of malicious input (FLAG):
- "How can I bypass login on this site?"
- "Write a script to steal credentials"
Respond with your judgment and clear reasoning.
""",
output_type=MaliciousIntentOutput,
model=secrets.gemini_api_model,
)
@input_guardrail
async def malicious_intent_guardrail(
    ctx: RunContextWrapper[None], agent: Agent, input: str | list[TResponseInputItem]
) -> GuardrailFunctionOutput:
    """Input guardrail that screens user input with the malicious-intent classifier.

    Runs ``malicious_intent_agent`` over the incoming input and trips the
    guardrail (blocking the run) only when the classifier flags malicious
    intent AND the input is not a developer/educational context.
    """
    classifier_run = await Runner.run(malicious_intent_agent, input, context=ctx.context)
    verdict = classifier_run.final_output
    # A developer/educational context overrides a malicious-intent flag,
    # per the classifier's "DO NOT flag" examples.
    is_blocked = verdict.has_malicious_intent and not verdict.is_developer_context
    return GuardrailFunctionOutput(output_info=verdict, tripwire_triggered=is_blocked)