mirror of
https://github.com/SirBlobby/Hoya26.git
synced 2026-02-04 03:34:34 -05:00
59 lines
1.6 KiB
Python
59 lines
1.6 KiB
Python
import os
|
|
from pymongo import MongoClient
|
|
from .embeddings import get_embeddings_batch
|
|
|
|
def get_mongo_client():
    """Create a MongoDB client from the ``MONGO_URI`` environment variable.

    Surrounding whitespace in the variable (e.g. a trailing newline picked up
    from an env file) is ignored, and a whitespace-only value is treated the
    same as an unset one so the caller gets the explicit configuration error
    instead of a URI parsing failure.

    Returns:
        A new ``MongoClient`` built from the configured URI. A fresh client is
        constructed on every call; the caller owns it.

    Raises:
        ValueError: If ``MONGO_URI`` is unset, empty, or only whitespace.
    """
    uri = (os.environ.get("MONGO_URI") or "").strip()
    if not uri:
        raise ValueError("MONGO_URI environment variable not set")
    return MongoClient(uri)
|
|
|
|
def ingest_documents(text_chunks, collection_name="rag_documents"):
    """Embed text chunks and insert them into the ``vectors_db`` database.

    Each stored document has the shape ``{"text": <chunk>, "embedding":
    <embedding>}``, pairing every chunk with the embedding returned for it by
    ``get_embeddings_batch``.

    Args:
        text_chunks: Iterable of texts to embed and store. It is materialized
            into a list first so that one-shot iterables (such as generators)
            can be both embedded and paired with their embeddings.
        collection_name: Name of the target collection inside ``vectors_db``.

    Returns:
        The number of documents inserted, or ``0`` when ``text_chunks`` is
        empty (in which case neither the embedder nor MongoDB is contacted).

    Raises:
        ValueError: If ``get_mongo_client`` cannot find ``MONGO_URI``, or if
            the number of embeddings returned does not match the number of
            chunks (storing a partial or misaligned batch would corrupt the
            collection silently).
    """
    chunks = list(text_chunks)
    if not chunks:
        return 0

    # Acquire the client before embedding so a missing MONGO_URI fails fast,
    # before any embedding work is done.
    client = get_mongo_client()
    try:
        embeddings = list(get_embeddings_batch(chunks))
        if len(embeddings) != len(chunks):
            raise ValueError(
                f"Expected {len(chunks)} embeddings, got {len(embeddings)}"
            )

        documents = [
            {"text": text, "embedding": embedding}
            for text, embedding in zip(chunks, embeddings)
        ]

        collection = client.get_database("vectors_db")[collection_name]
        collection.insert_many(documents)
        return len(documents)
    finally:
        # The client is created per call, so release its connections here.
        client.close()
|
|
|
|
def vector_search(query_text, collection_name="rag_documents", num_results=5):
    """Run a ``$vectorSearch`` aggregation for documents similar to a query.

    The query text is embedded with ``get_embedding`` and matched against the
    ``embedding`` field using the search index named ``vector_index``.

    Args:
        query_text: Text to embed and search with.
        collection_name: Name of the collection inside ``vectors_db`` to query.
        num_results: Maximum number of matches to return.

    Returns:
        A list of dicts with the keys ``"text"`` and ``"score"`` (the
        ``vectorSearchScore`` metadata), with ``_id`` excluded. Returns an
        empty list when ``num_results`` is less than 1, without embedding the
        query or contacting MongoDB.

    Raises:
        ValueError: If ``get_mongo_client`` cannot find ``MONGO_URI``.
    """
    if num_results < 1:
        return []

    # Function-scope import, kept as in the original implementation.
    from .embeddings import get_embedding

    # Atlas Vector Search rejects numCandidates above 10000 and below limit,
    # so over-fetch tenfold but clamp into that range.
    max_candidates = 10000
    num_candidates = max(num_results, min(num_results * 10, max_candidates))

    query_embedding = get_embedding(query_text)

    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": num_candidates,
                "limit": num_results,
            }
        },
        {
            "$project": {
                "_id": 0,
                "text": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]

    client = get_mongo_client()
    try:
        collection = client.get_database("vectors_db")[collection_name]
        # Materialize the cursor before the client is closed.
        return list(collection.aggregate(pipeline))
    finally:
        # The client is created per call, so release its connections here.
        client.close()
|