Initial Commit

This commit is contained in:
2026-01-24 02:32:25 +00:00
commit a4b7c82b1a
54 changed files with 888 additions and 0 deletions

67
backend/src/rag/store.py Normal file
View File

@@ -0,0 +1,67 @@
import os
from pymongo import MongoClient
from .embeddings import get_embeddings_batch
def get_mongo_client():
    """
    Builds a MongoClient from the MONGO_URI environment variable.

    Surrounding whitespace (for example a trailing newline picked up when the
    value is loaded from a file) is stripped before use, and a blank value is
    treated the same as an unset one.

    Returns:
        A MongoClient constructed from the URI. A new client is created on
        every call.

    Raises:
        ValueError: If MONGO_URI is unset, empty, or only whitespace.
    """
    # Defaulting to "" lets a single truthiness check cover "unset", "empty"
    # and "whitespace only" after stripping.
    uri = os.environ.get("MONGO_URI", "").strip()
    if not uri:
        raise ValueError("MONGO_URI environment variable not set")
    return MongoClient(uri)
def ingest_documents(text_chunks, collection_name="rag_documents"):
    """
    Generates embeddings for text chunks and stores them in MongoDB.

    Each stored document has the shape {"text": <chunk>, "embedding": <vector>}
    and is written to `collection_name` inside the "vectors_db" database.

    Args:
        text_chunks: Iterable of texts to embed and store. It is materialized
            into a list up front, so one-shot iterators are supported.
        collection_name: Name of the target collection.

    Returns:
        The number of documents inserted, or 0 when there was nothing to
        insert.

    Raises:
        ValueError: If the MongoDB connection string is not configured
            (raised by get_mongo_client).
    """
    # The chunks are consumed twice (once for embedding, once for pairing);
    # a one-shot iterator would otherwise be exhausted by the first pass and
    # silently produce zero documents.
    chunks = list(text_chunks)
    if not chunks:
        # Nothing to store: skip both the embedding call and the DB connection.
        return 0

    # Using the client as a context manager closes it when the block exits,
    # so the per-call connection is not left open.
    with get_mongo_client() as client:
        db = client.get_database("vectors_db")  # Default DB name
        collection = db[collection_name]

        # All chunks go to the embedding helper in one call; very large inputs
        # may need to be split to respect API limits (not handled here).
        embeddings = get_embeddings_batch(chunks)

        documents = [
            {"text": text, "embedding": embedding}
            for text, embedding in zip(chunks, embeddings)
        ]
        if not documents:
            return 0

        collection.insert_many(documents)
        return len(documents)
def vector_search(query_text, collection_name="rag_documents", num_results=5):
    """
    Performs a vector search in MongoDB.

    The query text is embedded and matched against the "embedding" field of
    `collection_name` (in the "vectors_db" database) using the Atlas
    `$vectorSearch` stage and the index named "vector_index".

    Note: You must have a vector search index defined in MongoDB Atlas for
    this to work.

    Args:
        query_text: Text to search for.
        collection_name: Name of the collection to search.
        num_results: Maximum number of matches to return.

    Returns:
        A list of dicts with the keys "text" and "score" (the vector search
        score). An empty list is returned when `num_results` is not positive.

    Raises:
        ValueError: If the MongoDB connection string is not configured
            (raised by get_mongo_client).
    """
    # `$vectorSearch` rejects a non-positive limit; asking for no results
    # simply yields none, without spending an embedding call or a connection.
    if num_results <= 0:
        return []

    # 1. Get embedding for the query
    from .embeddings import get_embedding
    query_embedding = get_embedding(query_text)

    # 2. Oversample candidates 10x for better recall, but stay within the
    # documented `$vectorSearch` bounds: at most 10000 candidates and never
    # fewer than the requested limit.
    max_candidates = 10000
    num_candidates = max(num_results, min(num_results * 10, max_candidates))

    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": num_candidates,
                "limit": num_results,
            }
        },
        {
            "$project": {
                "_id": 0,
                "text": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]

    # 3. Run the pipeline. The cursor is fully materialized inside the `with`
    # block because the client is closed as soon as the block exits.
    with get_mongo_client() as client:
        db = client.get_database("vectors_db")
        collection = db[collection_name]
        return list(collection.aggregate(pipeline))