Populate DB

This commit is contained in:
2026-01-24 05:08:01 +00:00
parent aad7e6e08d
commit d145f7e94c
4 changed files with 113 additions and 44 deletions

View File

@@ -0,0 +1,52 @@
import os
import sys
from pathlib import Path
# Add backend directory to path so we can import src
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from dotenv import load_dotenv
load_dotenv()
from src.rag.ingest import process_file
from src.rag.store import ingest_documents
from src.mongo.vector_store import is_file_processed, log_processed_file
def populate_from_dataset(dataset_dir) -> None:
    """Ingest every not-yet-processed CSV/PDF file found directly in *dataset_dir*.

    Only the top level of the directory is scanned (no recursion). Each
    matching file is skipped when ``is_file_processed`` reports its name as
    already handled; otherwise it is passed to ``process_file`` and the result
    is handed to ``ingest_documents``. When the reported count is positive the
    file name is recorded with ``log_processed_file`` and the running totals
    are updated. Progress and a final summary are printed to stdout.

    Args:
        dataset_dir: Path (string or path-like) of the directory to scan.

    Returns:
        None. If *dataset_dir* is not an existing directory, a message is
        printed and nothing is ingested.
    """
    dataset_path = Path(dataset_dir)
    # is_dir() instead of exists(): a path that points at a regular file is
    # not a usable dataset directory and must not be "scanned" silently.
    if not dataset_path.is_dir():
        print(f"Dataset directory not found: {dataset_dir}")
        return

    print(f"Scanning {dataset_dir}...")
    supported_suffixes = ('.csv', '.pdf')
    total_chunks = 0
    files_processed = 0

    # Sorted so that repeated runs visit the files in a stable order.
    for file_path in sorted(dataset_path.glob('*')):
        if not file_path.is_file():
            continue
        if file_path.suffix.lower() not in supported_suffixes:
            continue
        if is_file_processed(file_path.name):
            print(f"Skipping {file_path.name} (already processed)")
            continue

        print(f"Processing {file_path.name}...")
        try:
            chunks = process_file(str(file_path))
            if not chunks:
                print(" No text found/extracted.")
                continue
            count = ingest_documents(chunks)
            print(f" Ingested {count} chunks.")
            if count > 0:
                # Only files that actually produced stored chunks are logged
                # and counted, so an empty ingest is retried on the next run.
                log_processed_file(file_path.name)
                total_chunks += count
                files_processed += 1
        except Exception as e:
            # Best-effort batch job: one failing file is reported and the
            # loop moves on to the remaining files.
            print(f" Error processing file: {e}")

    print(f"\nFinished! Processed {files_processed} files. Total chunks ingested: {total_chunks}")
if __name__ == "__main__":
    # The dataset folder is addressed relative to the directory that holds
    # this script: one level up, then into "dataset".
    script_dir = os.path.dirname(__file__)
    populate_from_dataset(os.path.join(script_dir, '../dataset'))