diff --git a/backend/scripts/populate_db.py b/backend/scripts/populate_db.py
new file mode 100644
index 0000000..72d40f2
--- /dev/null
+++ b/backend/scripts/populate_db.py
@@ -0,0 +1,60 @@
+import os
+import sys
+from pathlib import Path
+
+# Add backend directory to path so we can import src
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+
+from dotenv import load_dotenv
+load_dotenv()
+
+from src.rag.ingest import process_file
+from src.rag.store import ingest_documents
+from src.mongo.vector_store import is_file_processed, log_processed_file
+
+def populate_from_dataset(dataset_dir):
+    """Ingest every not-yet-processed .csv/.pdf file directly inside dataset_dir.
+
+    Files for which is_file_processed() returns True are skipped. An exception
+    raised while processing or ingesting a file is printed and the loop moves
+    on to the next file. A file is logged as processed only when at least one
+    chunk was ingested. Prints a summary and returns None.
+    """
+    dataset_path = Path(dataset_dir)
+    if not dataset_path.exists():
+        print(f"Dataset directory not found: {dataset_dir}")
+        return
+
+    print(f"Scanning {dataset_dir}...")
+
+    total_chunks = 0
+    files_processed = 0
+
+    for file_path in dataset_path.glob('*'):
+        if file_path.is_file() and file_path.suffix.lower() in ['.csv', '.pdf']:
+            if is_file_processed(file_path.name):
+                print(f"Skipping {file_path.name} (already processed)")
+                continue
+
+            print(f"Processing {file_path.name}...")
+            try:
+                chunks = process_file(str(file_path))
+                if chunks:
+                    count = ingest_documents(chunks)
+                    print(f" Ingested {count} chunks.")
+                    if count > 0:
+                        log_processed_file(file_path.name)
+                        total_chunks += count
+                        files_processed += 1
+                else:
+                    print(" No text found/extracted.")
+            except Exception as e:
+                print(f" Error processing file: {e}")
+
+    print(f"\nFinished! Processed {files_processed} files. Total chunks ingested: {total_chunks}")
+
+if __name__ == "__main__":
+    # The dataset directory is resolved relative to this script's location
+    # (backend/dataset), not the current working directory.
+    dataset_dir = os.path.join(os.path.dirname(__file__), '../dataset')
+    populate_from_dataset(dataset_dir)
diff --git a/backend/src/mongo/connection.py b/backend/src/mongo/connection.py
new file mode 100644
index 0000000..d868d23
--- /dev/null
+++ b/backend/src/mongo/connection.py
@@ -0,0 +1,20 @@
+import os
+from functools import lru_cache
+
+from pymongo import MongoClient
+
+@lru_cache(maxsize=1)
+def get_mongo_client():
+    """Return the shared MongoClient, creating it on first use.
+
+    A MongoClient manages its own connection pool, so a single instance is
+    cached and reused instead of opening a new pool on every call. Once a
+    client has been created, later changes to MONGO_URI are ignored.
+
+    Raises:
+        ValueError: if the MONGO_URI environment variable is unset or empty.
+    """
+    uri = os.environ.get("MONGO_URI")
+    if not uri:
+        raise ValueError("MONGO_URI environment variable not set")
+    return MongoClient(uri)
diff --git a/backend/src/mongo/vector_store.py b/backend/src/mongo/vector_store.py
new file mode 100644
index 0000000..c1162f6
--- /dev/null
+++ b/backend/src/mongo/vector_store.py
@@ -0,0 +1,59 @@
+from datetime import datetime, timezone
+
+from .connection import get_mongo_client
+
+def _get_collection(collection_name, db_name):
+    """Return collection_name from database db_name on the shared client."""
+    return get_mongo_client().get_database(db_name)[collection_name]
+
+def insert_rag_documents(documents, collection_name="rag_documents", db_name="vectors_db"):
+    """Insert documents into the collection and return how many were inserted.
+
+    Returns 0 without inserting anything when documents is empty or None.
+    """
+    collection = _get_collection(collection_name, db_name)
+
+    if documents:
+        result = collection.insert_many(documents)
+        return len(result.inserted_ids)
+    return 0
+
+def search_rag_documents(query_embedding, collection_name="rag_documents", db_name="vectors_db", num_results=5):
+    """Run a $vectorSearch for query_embedding and return the matches as a list.
+
+    Each result carries the stored "text" and its "score" (vectorSearchScore).
+    The search uses the index named "vector_index" on the "embedding" field and
+    considers num_results * 10 candidates to return at most num_results hits.
+    """
+    collection = _get_collection(collection_name, db_name)
+
+    pipeline = [
+        {
+            "$vectorSearch": {
+                "index": "vector_index",
+                "path": "embedding",
+                "queryVector": query_embedding,
+                "numCandidates": num_results * 10,
+                "limit": num_results
+            }
+        },
+        {
+            "$project": {
+                "_id": 0,
+                "text": 1,
+                "score": { "$meta": "vectorSearchScore" }
+            }
+        }
+    ]
+
+    return list(collection.aggregate(pipeline))
+
+def is_file_processed(filename, log_collection="ingested_files", db_name="vectors_db"):
+    """Return True if filename has an entry in the ingestion log collection."""
+    collection = _get_collection(log_collection, db_name)
+    return collection.find_one({"filename": filename}) is not None
+
+def log_processed_file(filename, log_collection="ingested_files", db_name="vectors_db"):
+    """Record filename in the ingestion log with the current UTC time."""
+    collection = _get_collection(log_collection, db_name)
+    collection.insert_one({"filename": filename, "processed_at": datetime.now(timezone.utc)})
diff --git a/backend/src/rag/store.py b/backend/src/rag/store.py
index b09af99..1ba8628 100644
--- a/backend/src/rag/store.py
+++ b/backend/src/rag/store.py
@@ -1,18 +1,7 @@
-import os
-from pymongo import MongoClient
-from .embeddings import get_embeddings_batch
-
-def get_mongo_client():
-    uri = os.environ.get("MONGO_URI")
-    if not uri:
-        raise ValueError("MONGO_URI environment variable not set")
-    return MongoClient(uri)
+from .embeddings import get_embeddings_batch, get_embedding
+from ..mongo.vector_store import insert_rag_documents, search_rag_documents
 
 def ingest_documents(text_chunks, collection_name="rag_documents"):
-    client = get_mongo_client()
-    db = client.get_database("vectors_db")
-    collection = db[collection_name]
-
     embeddings = get_embeddings_batch(text_chunks)
 
     documents = []
@@ -22,37 +11,8 @@ def ingest_documents(text_chunks, collection_name="rag_documents"):
             "embedding": embedding
         })
 
-    if documents:
-        collection.insert_many(documents)
-        return len(documents)
-    return 0
+    return insert_rag_documents(documents, collection_name=collection_name)
 
 def vector_search(query_text, collection_name="rag_documents", num_results=5):
-    from .embeddings import get_embedding
     query_embedding = get_embedding(query_text)
-
-    client = get_mongo_client()
-    db = client.get_database("vectors_db")
-    collection = db[collection_name]
-
-    pipeline = [
-        {
-            "$vectorSearch": {
-                "index": "vector_index",
-                "path": "embedding",
-                "queryVector": query_embedding,
-                "numCandidates": num_results * 10,
-                "limit": num_results
-            }
-        },
-        {
-            "$project": {
-                "_id": 0,
-                "text": 1,
-                "score": { "$meta": "vectorSearchScore" }
-            }
-        }
-    ]
-
-    results = list(collection.aggregate(pipeline))
-    return results
+    return search_rag_documents(query_embedding, collection_name=collection_name, num_results=num_results)