VectorDB Search limit

2026-01-26 18:09:03 +00:00
parent f941d8342b
commit 4397c2913f
2 changed files with 31 additions and 33 deletions

View File

@@ -5,4 +5,5 @@ def get_mongo_client():
     uri = os.environ.get("MONGO_URI")
     if not uri:
         raise ValueError("MONGO_URI environment variable not set")
-    return MongoClient(uri)
+    print("Connecting to MongoDB Atlas...")
+    return MongoClient(uri, serverSelectionTimeoutMS=5000, connectTimeoutMS=10000, socketTimeoutMS=10000)
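The added keyword arguments bound how long PyMongo blocks before raising instead of hanging on an unreachable Atlas cluster (serverSelectionTimeoutMS defaults to 30 s). A sketch of how the failure now surfaces; the ping check is illustrative and not part of the commit:

import os
from pymongo import MongoClient
from pymongo.errors import ServerSelectionTimeoutError

client = MongoClient(
    os.environ["MONGO_URI"],
    serverSelectionTimeoutMS=5000,  # fail server selection after 5 s (default is 30 s)
    connectTimeoutMS=10000,         # cap the initial TCP/TLS handshake at 10 s
    socketTimeoutMS=10000,          # cap each individual socket read/write at 10 s
)
try:
    client.admin.command("ping")  # forces server selection, so a dead cluster fails here
except ServerSelectionTimeoutError as exc:
    print(f"MongoDB Atlas unreachable within 5 s: {exc}")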

View File

@@ -2,16 +2,12 @@ from flask import Blueprint, request, jsonify
 from src.rag.gemeni import GeminiClient
-from src.gemini import ask_gemini_with_rag
 from src.chroma.vector_store import search_documents
+from src.chroma.chroma_store import get_collection
 from src.rag.embeddings import get_embedding
-gemini_bp = Blueprint('gemini', __name__)
-brain = None
+# ... (imports)
-def get_brain():
-    global brain
-    if brain is None:
-        brain = GeminiClient()
-    return brain
+gemini_bp = Blueprint('gemini', __name__)
 @gemini_bp.route('/ask', methods=['POST'])
 def ask():
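This hunk, together with the matching change at the bottom of the file, drops the module-level brain singleton and constructs the client inline per request. If GeminiClient construction ever becomes expensive, the same lazy-singleton behaviour can be recovered without a global, for example (a sketch, not part of the commit):

from functools import lru_cache

@lru_cache(maxsize=1)
def get_brain():
    # Cached after the first call, so GeminiClient() runs once per process
    return GeminiClient()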
@@ -27,33 +23,34 @@ def ask():
     print(f"Generating embedding for prompt: {prompt}")
     query_embedding = get_embedding(prompt)
-    print("Searching Vector Database for context...")
-    search_results = search_documents(query_embedding, num_results=50)
+    # Maximize context retrieval (20 chunks)
+    print("Searching Vector Database for context (limit=20)...")
+    search_results = search_documents(query_embedding, num_results=20)
-    # Special handling for Georgetown University queries to ensure those docs are included
-    # even if generic corporate reports outrank them in vector search.
+    # KEYWORD BOOST: Specifically check for Georgetown documents if mentioned
+    # This ensures we get specific entity hits even if semantic similarity is weak for chunks
     if "georgetown" in prompt.lower():
         try:
-            from src.mongo.connection import get_mongo_client
-            client = get_mongo_client()
-            # Use hardcoded DB name to match vector_store.py
-            db = client["ethix_vectors"]
-            collection = db["rag_documents"]
+            print("Applied keyword boost: 'Georgetown'")
+            coll = get_collection()
+            # ChromaDB text search filter
+            gt_results = coll.get(
+                where_document={"$contains": "Georgetown"},
+                limit=20
+            )
-            # Fetch docs with Georgetown or MAP_INFO in the filename/source
-            gt_docs = list(collection.find({"source": {"$regex": "Georgetown|MAP_INFO", "$options": "i"}}).limit(30))
-            if gt_docs:
-                print(f"Direct Match: Found {len(gt_docs)} Georgetown specific documents.")
-                for doc in gt_docs:
-                    # Normalize to match search_results format
+            if gt_results and gt_results['documents']:
+                print(f"Found {len(gt_results['documents'])} keyword-matched documents.")
+                for i, text in enumerate(gt_results['documents']):
+                    # Add to results if not already present (check by text content hash or just duplication)
+                    # We'll just append with high score, duplicates handled by set later or LLM tolerance
                     search_results.append({
-                        "text": doc.get("text", ""),
-                        "metadata": doc.get("metadata", {"source": doc.get("source", "Georgetown File")}),
-                        "score": 1.0 # High priority
+                        "text": text,
+                        "metadata": gt_results['metadatas'][i] if gt_results['metadatas'] else {},
+                        "score": 1.0
                     })
         except Exception as e:
-            print(f"Error checking Georgetown docs: {e}")
+            print(f"Keyword boost failed: {e}")
     retrieved_context = ""
     if search_results:
@@ -87,7 +84,7 @@ def ask():
     full_context = retrieved_context
     # Step 3: Ask Gemini
-    client = get_brain()
+    client = GeminiClient()
     response = client.ask(prompt, full_context)
     return jsonify({
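The boost path appends keyword hits on top of the vector hits and, as the new comment notes, leaves duplicate chunks for later. A minimal dedupe pass over search_results (the helper name is hypothetical and not in this commit) could run before retrieved_context is assembled:

def dedupe_results(results):
    # Keep the first occurrence of each chunk text; boosted copies are appended
    # last, so an identical vector hit earlier in the list wins.
    seen = set()
    unique = []
    for r in results:
        text = r.get("text", "")
        if text and text not in seen:
            seen.add(text)
            unique.append(r)
    return unique

One caveat on the filter itself: where_document={"$contains": "Georgetown"} is a substring match and, in Chroma, case-sensitive, while the trigger check lowercases the prompt. A lower-case query therefore still activates the boost but only matches chunks containing the capitalised form.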