Populate DB

2026-02-04 03:34:34 -05:00 · 2026-01-24 05:08:01 +00:00
parent aad7e6e08d
commit d145f7e94c
4 changed files with 113 additions and 44 deletions
--- a/backend/scripts/populate_db.py
+++ b/backend/scripts/populate_db.py
@@ -0,0 +1,52 @@
+import os
+import sys
+from pathlib import Path
+
+# Add backend directory to path so we can import src
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+
+from dotenv import load_dotenv
+load_dotenv()
+
+from src.rag.ingest import process_file
+from src.rag.store import ingest_documents
+from src.mongo.vector_store import is_file_processed, log_processed_file
+
+def populate_from_dataset(dataset_dir) -> None:
+    """Ingest every not-yet-processed CSV/PDF file found directly in a directory.
+
+    Each matching file is passed to ``process_file``; when that yields chunks
+    they are handed to ``ingest_documents``. The file is recorded with
+    ``log_processed_file`` only when the reported ingested count is greater
+    than zero. Files for which ``is_file_processed`` returns true are skipped.
+    Progress and a final summary are printed to stdout.
+
+    Args:
+        dataset_dir: Path (string or path-like) of the directory to scan.
+            Only its immediate entries are considered; sub-directories are
+            not searched.
+
+    Returns:
+        None. When ``dataset_dir`` is not an existing directory a message is
+        printed and nothing is ingested.
+    """
+    supported_suffixes = ('.csv', '.pdf')
+
+    dataset_path = Path(dataset_dir)
+    # is_dir() rather than exists(): a regular file at this path has nothing
+    # to scan and should be reported, not silently treated as an empty dataset.
+    if not dataset_path.is_dir():
+        print(f"Dataset directory not found: {dataset_dir}")
+        return
+
+    print(f"Scanning {dataset_dir}...")
+
+    total_chunks = 0
+    files_processed = 0
+
+    # Sorted so files are processed (and reported) in a stable order across
+    # runs instead of whatever order the filesystem happens to return.
+    for file_path in sorted(dataset_path.glob('*')):
+        if not file_path.is_file():
+            continue
+        if file_path.suffix.lower() not in supported_suffixes:
+            continue
+
+        if is_file_processed(file_path.name):
+            print(f"Skipping {file_path.name} (already processed)")
+            continue
+
+        print(f"Processing {file_path.name}...")
+        try:
+            chunks = process_file(str(file_path))
+            if not chunks:
+                print("  No text found/extracted.")
+                continue
+
+            count = ingest_documents(chunks)
+            print(f"  Ingested {count} chunks.")
+            # Only mark the file as done when something was actually stored.
+            if count > 0:
+                log_processed_file(file_path.name)
+                total_chunks += count
+                files_processed += 1
+        except Exception as e:
+            # Deliberately best-effort: one failing file must not abort the
+            # rest of the batch, so report the error and move on.
+            print(f"  Error processing file: {e}")
+
+    print(f"\nFinished! Processed {files_processed} files. Total chunks ingested: {total_chunks}")
+
+if __name__ == "__main__":
+    # Script entry point: the dataset folder is located relative to this
+    # script file (<script dir>/../dataset), then everything in it is ingested.
+    script_dir = os.path.dirname(__file__)
+    populate_from_dataset(os.path.join(script_dir, '../dataset'))