Source code for bobbot.agents.safety_agent

"""Agent that checks if a message is appropriate for OpenAI's models or not."""

from typing import Optional

from langchain_core.messages import HumanMessage, SystemMessage

from bobbot.agents.llms import llm_gpt4omini_factual, openai_client
from bobbot.utils import get_logger, log_debug_info

logger = get_logger(__name__)


async def check_openai_safety(msg_history: str) -> Optional[bool]:
    """Determine whether the message history would be handled well by OpenAI's models.

    Uses OpenAI's moderation API and an LLM. Messages containing self-harm related content should be
    treated seriously. Serious threats toward a person or group of people should also be treated
    seriously. However, other NSFW messages should not be censored, and therefore should not be
    handled by OpenAI.

    Args:
        msg_history: The message history to check.

    Returns:
        Whether OpenAI's models would handle these messages well.
    """
    # Preliminary check with OpenAI's moderation API
    response = openai_client.moderations.create(input=msg_history)
    categories = response.results[0].categories
    # https://platform.openai.com/docs/api-reference/moderations/object
    true_categories: list[str] = [category for category, is_true in vars(categories).items() if is_true]
    if (
        categories.self_harm
        or categories.self_harm_instructions
        or categories.self_harm_intent
        or categories.hate_threatening
        or categories.harassment_threatening
        or categories.violence
        or categories.violence_graphic
    ):
        log_debug_info(f"===== Safety agent moderations =====\nFlagged: {true_categories}")
        return True  # Handle these seriously
    elif categories.sexual:
        log_debug_info(f"===== Safety agent moderations =====\nFlagged: {true_categories}")
        return False  # Don't handle these

    # Precise check with LLM (to find cases like cheating on an exam)
    SAFETY_PROMPT = """You are an expert decision maker named Bob chatting in a private Discord server. In this server, being toxic, edgy, using profanity, suggestive jokes, etc. is all acceptable. Your goal is to evaluate whether the most recent message in the chat history is appropriate.

1. For messages that are appropriate, respond with "ACCEPT". Most messages should be accepted.
2. For messages that are inappropriate, immoral, or violate ethical standards, respond with "REJECT".
3. For messages in a gray area, respond with "GRAY".

Here is the server's message history, including the most recent message. You MUST follow the response format of 'Thoughts:', followed by brainstorming thoughts, then 'Answer:', followed by the answer as one of ACCEPT, REJECT, or GRAY. Keep thoughts very concise."""  # noqa: E501
    messages = [SystemMessage(content=SAFETY_PROMPT), HumanMessage(content=msg_history)]
    # log_debug_info(f"===== Safety agent history =====\n{messages[1].content}")
    response = await llm_gpt4omini_factual.ainvoke(messages)
    content = response.content
    if true_categories or "ACCEPT" not in content:  # Only log for non-clean messages
        log_debug_info(f"===== Safety agent response =====\nFlagged: {true_categories}\n{content}")
    # Get the decision
    if "ACCEPT" in content:
        return True
    elif "GRAY" in content:
        return True  # Gray-area messages are still routed to OpenAI
    elif "REJECT" in content:
        return False
    logger.warning("Safety agent did not output a valid response - defaulting to ACCEPT.")
    return True
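

# --- Illustrative usage (a sketch, not part of the original module) -----------------
# Minimal example of calling check_openai_safety from an async entry point, assuming
# valid OpenAI credentials are configured for openai_client and llm_gpt4omini_factual.
# The "Name: message" history format below is an assumption for demonstration only;
# the real format is whatever the calling agent passes in.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        history = "Alice: hey bob\nAlice: can you proofread my essay for me?"
        is_safe = await check_openai_safety(history)
        if is_safe:
            print("OpenAI's models should handle this history well.")
        else:
            print("Route this history to a different model instead.")

    asyncio.run(_demo())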