Initial Commit

2026-02-04 11:44:34 -05:00 · 2026-01-24 02:32:25 +00:00
commit a4b7c82b1a
54 changed files with 888 additions and 0 deletions
--- a/backend/src/rag/store.py
+++ b/backend/src/rag/store.py
@@ -0,0 +1,67 @@
+import os
+from pymongo import MongoClient
+from .embeddings import get_embeddings_batch
+
+def get_mongo_client():
+    uri = os.environ.get("MONGO_URI")
+    if not uri:
+        raise ValueError("MONGO_URI environment variable not set")
+    return MongoClient(uri)
+
+def ingest_documents(text_chunks, collection_name="rag_documents"):
+    """
+    Generates embeddings for text chunks and stores them in MongoDB.
+    """
+    client = get_mongo_client()
+    db = client.get_database("vectors_db") # Default DB name
+    collection = db[collection_name]
+    
+    # Generate embeddings in batches (handling API limits might be needed for large sets)
+    embeddings = get_embeddings_batch(text_chunks)
+    
+    documents = []
+    for text, embedding in zip(text_chunks, embeddings):
+        documents.append({
+            "text": text,
+            "embedding": embedding
+        })
+    
+    if documents:
+        collection.insert_many(documents)
+        return len(documents)
+    return 0
+
+def vector_search(query_text, collection_name="rag_documents", num_results=5):
+    """
+    Performs a vector search in MongoDB.
+    """
+    # 1. Get embedding for the query
+    from .embeddings import get_embedding
+    query_embedding = get_embedding(query_text)
+    
+    client = get_mongo_client()
+    db = client.get_database("vectors_db")
+    collection = db[collection_name]
+    
+    # Note: You must have a vector search index defined in MongoDB Atlas for this to work.
+    pipeline = [
+        {
+            "$vectorSearch": {
+                "index": "vector_index",
+                "path": "embedding",
+                "queryVector": query_embedding,
+                "numCandidates": num_results * 10,
+                "limit": num_results
+            }
+        },
+        {
+            "$project": {
+                "_id": 0,
+                "text": 1,
+                "score": { "$meta": "vectorSearchScore" }
+            }
+        }
+    ]
+    
+    results = list(collection.aggregate(pipeline))
+    return results