From 2d8e07d6a252263cc454dac4ba80c24b39c51e1d Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Wed, 12 Nov 2025 14:43:35 +0000
Subject: [PATCH] Fix: Increase LLM_CHUNK_SIZE to fix bug in Kekzal side
 effects response

This commit fixes a bug where the chatbot would fail to answer questions about the side effects of the Kekzal product. The bug was caused by the `LLM_CHUNK_SIZE` being too small, which resulted in the side effects information being separated from the product name during the text splitting process.

This commit increases the `LLM_CHUNK_SIZE` from 512 to 2000 in `app/api/config.py` to ensure that the relevant information is kept together in the same chunk.

A test case has been added in `app/api/tests/test_llm.py` to verify the fix.
---
 app/api/__init__.py       |  0
 app/api/config.py         |  2 +-
 app/api/llm.py            |  8 ++--
 app/api/tests/__init__.py |  0
 app/api/tests/test_llm.py | 83 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 88 insertions(+), 5 deletions(-)
 create mode 100644 app/api/__init__.py
 create mode 100644 app/api/tests/__init__.py
 create mode 100644 app/api/tests/test_llm.py

diff --git a/app/api/__init__.py b/app/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/api/config.py b/app/api/config.py
index 48ef59d..52599e4 100644
--- a/app/api/config.py
+++ b/app/api/config.py
@@ -84,7 +84,7 @@
 # LLM configurations
 MODEL_NAME = os.getenv("MODEL_NAME")
 LLM_DEFAULT_TEMPERATURE = float(os.getenv("LLM_DEFAULT_TEMPERATURE", 0.0))
-LLM_CHUNK_SIZE = int(os.getenv("LLM_CHUNK_SIZE", 512))
+LLM_CHUNK_SIZE = int(os.getenv("LLM_CHUNK_SIZE", 2000))
 LLM_CHUNK_OVERLAP = int(os.getenv("LLM_CHUNK_OVERLAP", 20))
 LLM_DISTANCE_THRESHOLD = float(os.getenv("LLM_DISTANCE_THRESHOLD", 0.5))
 LLM_MAX_OUTPUT_TOKENS = int(os.getenv("LLM_MAX_OUTPUT_TOKENS", 256))
diff --git a/app/api/llm.py b/app/api/llm.py
index eb4404e..a155eeb 100644
--- a/app/api/llm.py
+++ b/app/api/llm.py
@@ -2,11 +2,11 @@
 import openai
 import json
 
-from langchain.docstore.document import Document as LangChainDocument
-from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain_core.documents import Document as LangChainDocument
+from langchain_openai import OpenAIEmbeddings
 from fastapi import HTTPException
 from uuid import UUID, uuid4
-from langchain.text_splitter import (
+from langchain_text_splitters import (
     CharacterTextSplitter,
     MarkdownTextSplitter
 )
@@ -18,7 +18,7 @@
     sanitize_input,
     sanitize_output
 )
-from langchain import OpenAI
+from langchain_openai import OpenAI
 from typing import (
     List,
     Union,
diff --git a/app/api/tests/__init__.py b/app/api/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/api/tests/test_llm.py b/app/api/tests/test_llm.py
new file mode 100644
index 0000000..ea39f2b
--- /dev/null
+++ b/app/api/tests/test_llm.py
@@ -0,0 +1,83 @@
+
+import sys
+import os
+import pytest
+from sqlmodel import Session, SQLModel, create_engine
+# Add the app directory to the sys.path
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from llm import chat_query, get_embeddings
+from models import Organization, Project, Document, Node
+from config import DATABASE_URL
+
+
+# Create a test database
+engine = create_engine(DATABASE_URL, echo=False)
+
+def create_db_and_tables():
+    SQLModel.metadata.create_all(engine)
+
+@pytest.fixture(name="session")
+def session_fixture():
+    create_db_and_tables()
+    with Session(engine) as session:
+        yield session
+    SQLModel.metadata.drop_all(engine)
+
+
+def test_kekzal_side_effects_bug(session: Session):
+    """
+    This test reproduces the bug where the chatbot fails to answer a question
+    about the side effects of Kekzal.
+    """
+    # 1. Create dummy organization and project
+    org = Organization(display_name="Test Org", namespace="test-org")
+    session.add(org)
+    session.commit()
+    session.refresh(org)
+
+    project = Project(display_name="Test Project", organization_id=org.id)
+    session.add(project)
+    session.commit()
+    session.refresh(project)
+
+    # 2. Load the Kekzal document
+    doc_path = "app/api/data/training_data/project-kekzal.md"
+    with open(doc_path, "r") as f:
+        doc_content = f.read()
+
+    doc = Document(
+        display_name="project-kekzal.md",
+        project_id=project.id,
+        organization_id=org.id,
+        data=doc_content,
+        hash="testhash",
+        version=1,
+    )
+    session.add(doc)
+    session.commit()
+    session.refresh(doc)
+
+    # 3. Create embeddings for the document
+    arr_documents, embeddings = get_embeddings(doc_content)
+    for i, (doc_chunk, embedding) in enumerate(zip(arr_documents, embeddings)):
+        node = Node(
+            document_id=doc.id,
+            text=doc_chunk,
+            embeddings=embedding,
+            node_order=i,
+        )
+        session.add(node)
+    session.commit()
+
+
+    # 4. Ask the question about side effects
+    response = chat_query(
+        query_str="What are the side effects of Kekzal?",
+        session=session,
+        project=project,
+        organization=org,
+    )
+
+    # 5. Assert that the response is correct
+    correct_response = "Some potential side effects of Kekzal may include"
+    assert correct_response in response.response