mirror of
https://github.com/SirBlobby/Hoya26.git
synced 2026-02-04 11:44:34 -05:00
Initial Commit
This commit is contained in:
67
backend/src/rag/store.py
Normal file
67
backend/src/rag/store.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import os
|
||||
from pymongo import MongoClient
|
||||
from .embeddings import get_embeddings_batch
|
||||
|
||||
# One client per connection string. A MongoClient owns its own connection
# pool, so building a fresh one on every call (and never closing it) leaks
# connections; callers in this module ask for a client on every operation.
_MONGO_CLIENTS = {}


def get_mongo_client():
    """Return a shared ``MongoClient`` for the URI in ``MONGO_URI``.

    The environment variable is read on every call, so a missing value is
    always reported, and a changed value yields a client for the new URI.
    Clients are created lazily and reused for the lifetime of the process.

    Returns:
        The ``MongoClient`` connected to ``os.environ["MONGO_URI"]``.

    Raises:
        ValueError: If ``MONGO_URI`` is unset or empty.
    """
    uri = os.environ.get("MONGO_URI")
    if not uri:
        raise ValueError("MONGO_URI environment variable not set")

    client = _MONGO_CLIENTS.get(uri)
    if client is None:
        client = MongoClient(uri)
        _MONGO_CLIENTS[uri] = client
    return client
|
||||
|
||||
def ingest_documents(text_chunks, collection_name="rag_documents"):
    """Embed text chunks and store them in MongoDB.

    Each chunk is written to the ``vectors_db`` database as a document of the
    form ``{"text": <chunk>, "embedding": <vector>}``.

    Args:
        text_chunks: Iterable of text chunks to embed and store. It is
            materialized once, so one-shot iterators/generators are safe.
        collection_name: Target collection inside ``vectors_db``.

    Returns:
        The number of documents inserted (``0`` for empty input).

    Raises:
        ValueError: If the embedding backend returns a different number of
            vectors than there are chunks.
    """
    # Materialize first: the chunks are walked twice (once by the embedding
    # call, once when pairing), which would silently drop everything if the
    # caller handed in a generator.
    chunks = list(text_chunks)
    if not chunks:
        # Nothing to store: skip the database connection and embedding call.
        return 0

    client = get_mongo_client()
    collection = client.get_database("vectors_db")[collection_name]

    # Generate embeddings in one batch (handling API limits might be needed
    # for large sets).
    embeddings = list(get_embeddings_batch(chunks))
    if len(embeddings) != len(chunks):
        # zip() would truncate to the shorter side and store a partial or
        # misaligned set without any signal.
        raise ValueError(
            f"Expected {len(chunks)} embeddings, got {len(embeddings)}"
        )

    documents = [
        {"text": text, "embedding": embedding}
        for text, embedding in zip(chunks, embeddings)
    ]
    collection.insert_many(documents)
    return len(documents)
|
||||
|
||||
def vector_search(query_text, collection_name="rag_documents", num_results=5):
    """Run a ``$vectorSearch`` aggregation for ``query_text`` in MongoDB.

    Args:
        query_text: Text to embed and search for.
        collection_name: Collection inside ``vectors_db`` to query.
        num_results: Maximum number of matches to return.

    Returns:
        A list of dicts with keys ``"text"`` and ``"score"`` (the
        ``vectorSearchScore`` metadata). Empty when ``num_results`` is not
        positive.

    Note:
        The collection must have a vector search index named
        ``vector_index`` over the ``embedding`` field defined in MongoDB
        Atlas for this to work.
    """
    # $vectorSearch needs a positive limit; answer trivially instead of
    # paying for an embedding call and then failing on the server.
    if num_results <= 0:
        return []

    # Imported here as in the original module layout.
    # NOTE(review): presumably deferred on purpose -- confirm before hoisting.
    from .embeddings import get_embedding

    # 1. Get embedding for the query
    query_embedding = get_embedding(query_text)

    client = get_mongo_client()
    collection = client.get_database("vectors_db")[collection_name]

    # 2. Oversample 10x for recall, but stay within the Atlas upper bound on
    #    numCandidates (10000); larger values are rejected by the server.
    max_candidates = 10000
    num_candidates = min(num_results * 10, max_candidates)

    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": num_candidates,
                "limit": num_results,
            }
        },
        {
            "$project": {
                "_id": 0,
                "text": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]

    return list(collection.aggregate(pipeline))
|
||||
Reference in New Issue
Block a user