import os
import sys
import argparse
from pathlib import Path

# Allow imports from the project root when running this script directly.
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

from dotenv import load_dotenv
load_dotenv()

from src.rag.ingest import process_file
from src.rag.store import ingest_documents
from src.mongo.metadata import is_file_processed, log_processed_file


def populate_from_dataset(dataset_dir, category=None):
    """Scan a dataset directory and ingest supported files into the vector store."""
    dataset_path = Path(dataset_dir)
    if not dataset_path.exists():
        print(f"Dataset directory not found: {dataset_dir}")
        return

    print(f"Scanning {dataset_dir}...")
    if category:
        print(f"Category: {category}")

    total_chunks = 0
    files_processed = 0

    for file_path in dataset_path.glob('*'):
        if file_path.is_file() and file_path.suffix.lower() in ['.csv', '.pdf', '.txt', '.xlsx']:
            # Skip files that were already ingested in a previous run.
            if is_file_processed(file_path.name):
                print(f"Skipping {file_path.name} (already processed)")
                continue

            print(f"Processing {file_path.name}...")
            try:
                chunks = process_file(str(file_path))
                if chunks:
                    count = ingest_documents(chunks, source_file=file_path.name, category=category)
                    print(f"  Ingested {count} chunks.")
                    if count > 0:
                        # Record the file in metadata so it is not re-ingested later.
                        log_processed_file(file_path.name, category=category, chunk_count=count)
                        total_chunks += count
                        files_processed += 1
                else:
                    print("  No text found/extracted.")
            except Exception as e:
                print(f"  Error processing file: {e}")

    print(f"\nFinished! Processed {files_processed} files. Total chunks ingested: {total_chunks}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Populate vector database from dataset files")
    parser.add_argument("--category", "-c", type=str, help="Category to assign to ingested documents")
    parser.add_argument("--dir", "-d", type=str, default=None, help="Dataset directory path")
    args = parser.parse_args()

    # Default to the repository's ../dataset directory when no path is given.
    if args.dir:
        dataset_dir = args.dir
    else:
        dataset_dir = os.path.join(os.path.dirname(__file__), '../dataset')

    populate_from_dataset(dataset_dir, category=args.category)