mirror of
https://github.com/SirBlobby/Hoya26.git
synced 2026-02-03 19:24:34 -05:00
VectorDB Search limit
This commit is contained in:
@@ -1,8 +1,9 @@
|
||||
import os
|
||||
from pymongo import MongoClient
|
||||
|
||||
def get_mongo_client():
    """Create a MongoClient from the ``MONGO_URI`` environment variable.

    Returns:
        MongoClient: a client connected to the configured MongoDB deployment.

    Raises:
        ValueError: if ``MONGO_URI`` is unset or empty.
    """
    uri = os.environ.get("MONGO_URI")
    if not uri:
        # Fail fast with a clear message rather than letting pymongo
        # attempt a connection with a None/empty URI.
        raise ValueError("MONGO_URI environment variable not set")
    return MongoClient(uri)
|
||||
def get_mongo_client():
    """Return a MongoClient built from the ``MONGO_URI`` environment variable.

    The client is configured with explicit server-selection, connect, and
    socket timeouts so a misconfigured URI fails quickly instead of hanging.

    Raises:
        ValueError: if ``MONGO_URI`` is unset or empty.
    """
    connection_uri = os.environ.get("MONGO_URI")
    if connection_uri:
        print("Connecting to MongoDB Atlas...")
        return MongoClient(
            connection_uri,
            serverSelectionTimeoutMS=5000,
            connectTimeoutMS=10000,
            socketTimeoutMS=10000,
        )
    raise ValueError("MONGO_URI environment variable not set")
|
||||
|
||||
@@ -2,16 +2,12 @@ from flask import Blueprint, request, jsonify
|
||||
from src.rag.gemeni import GeminiClient
|
||||
from src.gemini import ask_gemini_with_rag
|
||||
from src.chroma.vector_store import search_documents
|
||||
from src.chroma.chroma_store import get_collection
|
||||
from src.rag.embeddings import get_embedding
|
||||
|
||||
gemini_bp = Blueprint('gemini', __name__)
|
||||
brain = None
|
||||
# ... (imports)
|
||||
|
||||
def get_brain():
    """Return the module-level GeminiClient, constructing it on first use."""
    global brain
    if brain is not None:
        return brain
    # First call: build and memoize the client for subsequent requests.
    brain = GeminiClient()
    return brain
|
||||
gemini_bp = Blueprint('gemini', __name__)
|
||||
|
||||
@gemini_bp.route('/ask', methods=['POST'])
|
||||
def ask():
|
||||
@@ -27,33 +23,34 @@ def ask():
|
||||
print(f"Generating embedding for prompt: {prompt}")
|
||||
query_embedding = get_embedding(prompt)
|
||||
|
||||
print("Searching Vector Database for context...")
|
||||
search_results = search_documents(query_embedding, num_results=50)
|
||||
# Maximize context retrieval (20 chunks)
|
||||
print("Searching Vector Database for context (limit=20)...")
|
||||
search_results = search_documents(query_embedding, num_results=20)
|
||||
|
||||
# Special handling for Georgetown University queries to ensure those docs are included
|
||||
# even if generic corporate reports outrank them in vector search.
|
||||
# KEYWORD BOOST: Specifically check for Georgetown documents if mentioned
|
||||
# This ensures we get specific entity hits even if semantic similarity is weak for chunks
|
||||
if "georgetown" in prompt.lower():
|
||||
try:
|
||||
from src.mongo.connection import get_mongo_client
|
||||
client = get_mongo_client()
|
||||
# Use hardcoded DB name to match vector_store.py
|
||||
db = client["ethix_vectors"]
|
||||
collection = db["rag_documents"]
|
||||
print("Applied keyword boost: 'Georgetown'")
|
||||
coll = get_collection()
|
||||
# ChromaDB text search filter
|
||||
gt_results = coll.get(
|
||||
where_document={"$contains": "Georgetown"},
|
||||
limit=20
|
||||
)
|
||||
|
||||
# Fetch docs with Georgetown or MAP_INFO in the filename/source
|
||||
gt_docs = list(collection.find({"source": {"$regex": "Georgetown|MAP_INFO", "$options": "i"}}).limit(30))
|
||||
|
||||
if gt_docs:
|
||||
print(f"Direct Match: Found {len(gt_docs)} Georgetown specific documents.")
|
||||
for doc in gt_docs:
|
||||
# Normalize to match search_results format
|
||||
if gt_results and gt_results['documents']:
|
||||
print(f"Found {len(gt_results['documents'])} keyword-matched documents.")
|
||||
for i, text in enumerate(gt_results['documents']):
|
||||
# Add to results if not already present (check by text content hash or just duplication)
|
||||
# We'll just append with high score, duplicates handled by set later or LLM tolerance
|
||||
search_results.append({
|
||||
"text": doc.get("text", ""),
|
||||
"metadata": doc.get("metadata", {"source": doc.get("source", "Georgetown File")}),
|
||||
"score": 1.0 # High priority
|
||||
"text": text,
|
||||
"metadata": gt_results['metadatas'][i] if gt_results['metadatas'] else {},
|
||||
"score": 1.0
|
||||
})
|
||||
except Exception as e:
|
||||
print(f"Error checking Georgetown docs: {e}")
|
||||
print(f"Keyword boost failed: {e}")
|
||||
|
||||
retrieved_context = ""
|
||||
if search_results:
|
||||
@@ -87,7 +84,7 @@ def ask():
|
||||
full_context = retrieved_context
|
||||
|
||||
# Step 3: Ask Gemini
|
||||
client = get_brain()
|
||||
client = GeminiClient()
|
||||
response = client.ask(prompt, full_context)
|
||||
|
||||
return jsonify({
|
||||
|
||||
Reference in New Issue
Block a user