# Mirror of https://github.com/SirBlobby/Hoya26.git (synced 2026-02-04 03:34:34 -05:00)
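"""Populate the RAG vector store from the local dataset directory.

Scans a dataset folder for .csv and .pdf files, chunks each file with
src.rag.ingest.process_file, ingests the chunks via src.rag.store, and logs
processed filenames in Mongo so that reruns skip already-ingested files.
"""
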
import os
import sys
from pathlib import Path

# Add the backend directory to sys.path so we can import src.
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

# Load environment variables before importing modules that may read them
# (e.g. database URIs or embedding API keys) at import time.
from dotenv import load_dotenv
load_dotenv()

from src.rag.ingest import process_file
from src.rag.store import ingest_documents
from src.mongo.vector_store import is_file_processed, log_processed_file


def populate_from_dataset(dataset_dir):
    dataset_path = Path(dataset_dir)
    if not dataset_path.exists():
        print(f"Dataset directory not found: {dataset_dir}")
        return

    print(f"Scanning {dataset_dir}...")

    total_chunks = 0
    files_processed = 0

    for file_path in dataset_path.glob('*'):
        # Only ingest the supported file types.
        if file_path.is_file() and file_path.suffix.lower() in ['.csv', '.pdf']:
            # Skip files that an earlier run already ingested.
            if is_file_processed(file_path.name):
                print(f"Skipping {file_path.name} (already processed)")
                continue

            print(f"Processing {file_path.name}...")
            try:
                chunks = process_file(str(file_path))
                if chunks:
                    count = ingest_documents(chunks)
                    print(f"  Ingested {count} chunks.")
                    # Only mark the file as processed (and count it) once at
                    # least one chunk actually made it into the store.
                    if count > 0:
                        log_processed_file(file_path.name)
                        total_chunks += count
                        files_processed += 1
                else:
                    print("  No text found/extracted.")
            except Exception as e:
                print(f"  Error processing file: {e}")

    print(f"\nFinished! Processed {files_processed} files. Total chunks ingested: {total_chunks}")


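# For reference: the two dedupe helpers imported above are assumed to track
# processed filenames in a MongoDB collection. A minimal sketch of that shape
# (the client setup, env var, collection, and field names are illustrative,
# not the repo's actual src.mongo.vector_store implementation):
#
#   from pymongo import MongoClient
#
#   _processed = MongoClient(os.environ["MONGODB_URI"])["rag"]["processed_files"]
#
#   def is_file_processed(filename):
#       return _processed.find_one({"name": filename}) is not None
#
#   def log_processed_file(filename):
#       _processed.update_one({"name": filename}, {"$set": {"name": filename}}, upsert=True)
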
if __name__ == "__main__":
    # Resolve ../dataset relative to this script's location, so the script
    # works regardless of the current working directory.
    dataset_dir = os.path.join(os.path.dirname(__file__), '../dataset')
    populate_from_dataset(dataset_dir)
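# Example invocation, assuming dependencies are installed and a .env file with
# the required credentials sits where load_dotenv() can find it:
#
#   python path/to/this_script.py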