Populate DB ChromaDB

2026-02-04 03:34:34 -05:00 · 2026-01-24 07:52:48 +00:00
parent d145f7e94c
commit 4298368b63
10 changed files with 279 additions and 48 deletions
--- a/backend/scripts/populate_db.py
+++ b/backend/scripts/populate_db.py
@@ -1,8 +1,8 @@
 import os
 import sys
+import argparse
 from pathlib import Path

-# Add backend directory to path so we can import src
 sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

 from dotenv import load_dotenv
@@ -10,21 +10,23 @@ load_dotenv()

 from src.rag.ingest import process_file
 from src.rag.store import ingest_documents
-from src.mongo.vector_store import is_file_processed, log_processed_file
+from src.mongo.metadata import is_file_processed, log_processed_file

-def populate_from_dataset(dataset_dir):
+def populate_from_dataset(dataset_dir, category=None):
    dataset_path = Path(dataset_dir)
    if not dataset_path.exists():
        print(f"Dataset directory not found: {dataset_dir}")
        return

    print(f"Scanning {dataset_dir}...")
+    if category:
+        print(f"Category: {category}")
    
    total_chunks = 0
    files_processed = 0

    for file_path in dataset_path.glob('*'):
-        if file_path.is_file() and file_path.suffix.lower() in ['.csv', '.pdf']:
+        if file_path.is_file() and file_path.suffix.lower() in ['.csv', '.pdf', '.txt', '.xlsx']:
            if is_file_processed(file_path.name):
                print(f"Skipping {file_path.name} (already processed)")
                continue
@@ -33,10 +35,10 @@ def populate_from_dataset(dataset_dir):
            try:
                chunks = process_file(str(file_path))
                if chunks:
-                    count = ingest_documents(chunks)
+                    count = ingest_documents(chunks, source_file=file_path.name, category=category)
                    print(f"  Ingested {count} chunks.")
                    if count > 0:
-                        log_processed_file(file_path.name)
+                        log_processed_file(file_path.name, category=category, chunk_count=count)
                        total_chunks += count
                        files_processed += 1
                else:
@@ -47,6 +49,14 @@ def populate_from_dataset(dataset_dir):
    print(f"\nFinished! Processed {files_processed} files. Total chunks ingested: {total_chunks}")

 if __name__ == "__main__":
-    # Assuming run from backend/
-    dataset_dir = os.path.join(os.path.dirname(__file__), '../dataset')
-    populate_from_dataset(dataset_dir)
+    parser = argparse.ArgumentParser(description="Populate vector database from dataset files")
+    parser.add_argument("--category", "-c", type=str, help="Category to assign to ingested documents")
+    parser.add_argument("--dir", "-d", type=str, default=None, help="Dataset directory path")
+    args = parser.parse_args()
+    
+    if args.dir:
+        dataset_dir = args.dir
+    else:
+        dataset_dir = os.path.join(os.path.dirname(__file__), '../dataset')
+    
+    populate_from_dataset(dataset_dir, category=args.category)