# Mirror of https://github.com/SirBlobby/Hoya26.git (synced 2026-02-04 03:34:34 -05:00)
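"""Populate the RAG vector store from the local dataset directory.

Scans a dataset folder for .csv and .pdf files, chunks each file with
src.rag.ingest.process_file, ingests the chunks via src.rag.store, and logs
processed filenames in Mongo so that reruns skip already-ingested files.
"""
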
import os
import sys
from pathlib import Path

# Add the backend directory to sys.path so we can import src.
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

# Load environment variables before importing modules that may read them
# (e.g. database URIs or embedding API keys) at import time.
from dotenv import load_dotenv
load_dotenv()

from src.rag.ingest import process_file
from src.rag.store import ingest_documents
from src.mongo.vector_store import is_file_processed, log_processed_file


def populate_from_dataset(dataset_dir):
    dataset_path = Path(dataset_dir)
    if not dataset_path.exists():
        print(f"Dataset directory not found: {dataset_dir}")
        return

    print(f"Scanning {dataset_dir}...")

    total_chunks = 0
    files_processed = 0

    for file_path in dataset_path.glob('*'):
        # Only ingest the supported file types.
        if file_path.is_file() and file_path.suffix.lower() in ['.csv', '.pdf']:
            # Skip files that an earlier run already ingested.
            if is_file_processed(file_path.name):
                print(f"Skipping {file_path.name} (already processed)")
                continue

            print(f"Processing {file_path.name}...")
            try:
                chunks = process_file(str(file_path))
                if chunks:
                    count = ingest_documents(chunks)
                    print(f"  Ingested {count} chunks.")
                    # Only mark the file as processed (and count it) once at
                    # least one chunk actually made it into the store.
                    if count > 0:
                        log_processed_file(file_path.name)
                        total_chunks += count
                        files_processed += 1
                else:
                    print("  No text found/extracted.")
            except Exception as e:
                print(f"  Error processing file: {e}")

    print(f"\nFinished! Processed {files_processed} files. Total chunks ingested: {total_chunks}")


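# For reference: the two dedupe helpers imported above are assumed to track
# processed filenames in a MongoDB collection. A minimal sketch of that shape
# (the client setup, env var, collection, and field names are illustrative,
# not the repo's actual src.mongo.vector_store implementation):
#
#   from pymongo import MongoClient
#
#   _processed = MongoClient(os.environ["MONGODB_URI"])["rag"]["processed_files"]
#
#   def is_file_processed(filename):
#       return _processed.find_one({"name": filename}) is not None
#
#   def log_processed_file(filename):
#       _processed.update_one({"name": filename}, {"$set": {"name": filename}}, upsert=True)
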
if __name__ == "__main__":
    # Resolve ../dataset relative to this script's location, so the script
    # works regardless of the current working directory.
    dataset_dir = os.path.join(os.path.dirname(__file__), '../dataset')
    populate_from_dataset(dataset_dir)
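# Example invocation, assuming dependencies are installed and a .env file with
# the required credentials sits where load_dotenv() can find it:
#
#   python path/to/this_script.py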