Database and Reports Update

2026-01-25 15:31:41 +00:00
parent 5ce0b4d278
commit 34eba52461
38 changed files with 3820 additions and 2797 deletions

151
backend/README.md Normal file
View File

@@ -0,0 +1,151 @@
# Ethix Backend
A Flask-based API server for the Ethix greenwashing detection platform. This backend provides AI-powered analysis of products and companies to identify misleading environmental claims.
## Technology Stack
| Component | Technology |
|-----------|------------|
| Framework | Flask |
| AI/LLM | Google Gemini, Ollama |
| Vector Database | ChromaDB |
| Document Store | MongoDB |
| Embeddings | Ollama (nomic-embed-text) |
| Vision AI | Ollama (ministral-3) |
| Computer Vision | OpenCV, Ultralytics (YOLO) |
| Document Processing | PyPDF, openpyxl, pandas |
## Prerequisites
- Python 3.10+
- MongoDB instance
- Access to ChromaDB server
- Access to Ollama server
- Google API Key (for Gemini)
## Environment Variables
Create a `.env` file in the backend directory:
```env
GOOGLE_API_KEY=your_google_api_key
MONGO_URI=your_mongodb_connection_string
CHROMA_HOST=http://your-chromadb-host
OLLAMA_HOST=https://your-ollama-host
```
| Variable | Description | Default |
|----------|-------------|---------|
| `GOOGLE_API_KEY` | Google Gemini API key | (required) |
| `MONGO_URI` | MongoDB connection string | (required) |
| `CHROMA_HOST` | ChromaDB server URL | `http://chroma.sirblob.co` |
| `OLLAMA_HOST` | Ollama server URL | `https://ollama.sirblob.co` |
| `ATLAS_VECTORS` | Set to `true` to use MongoDB Atlas Vector Search instead of ChromaDB | `false` |
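For reference, the backend modules read these values straight from the environment and fall back to the defaults above, so a missing `.env` still points at the hosted services. A small illustration that mirrors the lookups in `src/`:
```python
import os

# Defaults mirror the table above; override them via .env or the shell environment.
CHROMA_HOST = os.environ.get("CHROMA_HOST", "http://chroma.sirblob.co")
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "https://ollama.sirblob.co")
# Optional toggle: route vector operations to MongoDB Atlas instead of ChromaDB.
USE_ATLAS = os.environ.get("ATLAS_VECTORS", "false").lower() == "true"
```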
## Installation
1. Create and activate a virtual environment:
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```
2. Install dependencies:
```bash
pip install -r requirements.txt
```
## Running the Server
### Development
```bash
python app.py
```
The server will start on `http://localhost:5000`.
### Production
```bash
gunicorn -w 4 -b 0.0.0.0:5000 app:app
```
## API Endpoints
### Gemini AI
| Method | Endpoint | Description |
|--------|----------|-------------|
| POST | `/api/gemini/ask` | Chat with AI using RAG context |
| POST | `/api/gemini/rag` | Query with category filtering |
| POST | `/api/gemini/vision` | Vision analysis (not implemented) |
### Incidents
| Method | Endpoint | Description |
|--------|----------|-------------|
| POST | `/api/incidents/submit` | Submit a greenwashing report |
| GET | `/api/incidents/list` | Get all confirmed incidents |
| GET | `/api/incidents/<id>` | Get specific incident details |
### Reports
| Method | Endpoint | Description |
|--------|----------|-------------|
| GET | `/api/reports/` | List all company reports |
| POST | `/api/reports/search` | Semantic search for reports |
| POST | `/api/reports/upload` | Upload and AI-verify a company report PDF |
| GET | `/api/reports/view/<filename>` | Download a report file |
| GET | `/api/reports/stats` | Aggregate statistics for the landing page |
### RAG
| Method | Endpoint | Description |
|--------|----------|-------------|
| POST | `/api/rag/ingest` | Ingest document chunks |
| POST | `/api/rag/search` | Search vector database |
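### Example Usage
A minimal smoke test of the chat and report-upload endpoints with Python `requests`, assuming the development server on `http://localhost:5000`; the prompt, company name, and `report.pdf` path are placeholders:
```python
import base64

import requests

BASE = "http://localhost:5000"

# /api/gemini/ask takes a JSON body with "prompt" (plus an optional "context")
# and answers with {"status": "success", "reply": "..."}.
chat = requests.post(f"{BASE}/api/gemini/ask",
                     json={"prompt": "Have any drink brands made misleading recycling claims?"})
print(chat.json().get("reply"))

# /api/reports/upload expects base64-encoded PDF bytes in "pdf_data" and an
# optional "company_name"; the PDF is AI-validated before it is ingested.
with open("report.pdf", "rb") as f:
    pdf_b64 = base64.b64encode(f.read()).decode()
upload = requests.post(f"{BASE}/api/reports/upload",
                       json={"pdf_data": pdf_b64, "company_name": "Acme"})
print(upload.json().get("status"), upload.json().get("filename"))
```
A `rejected` status with step `validation` means the PDF did not look like a legitimate sustainability report to the validator.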
## External Services
The backend integrates with the following external services:
| Service | URL | Purpose |
|---------|-----|---------|
| ChromaDB | `http://chroma.sirblob.co` | Vector storage and similarity search |
| Ollama | `https://ollama.sirblob.co` | Embeddings and vision analysis |
## Docker
Build and run using Docker:
```bash
docker build -t ethix-backend .
docker run -p 5000:5000 --env-file .env ethix-backend
```
Or use Docker Compose from the project root:
```bash
docker-compose up backend
```
## Core Features
### Greenwashing Detection
The incident submission pipeline (a condensed sketch of steps 4-6 follows the list):
1. User uploads product image or company PDF
2. Vision model detects brand logos (for products)
3. PDF text extraction (for company reports)
4. Embedding generation for semantic search
5. RAG context retrieval from ChromaDB
6. Gemini analysis with structured output
7. Results stored in MongoDB and ChromaDB
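Steps 4-6 are the core of the analysis: embed the question or claim, retrieve nearby evidence from the vector store, and let Gemini reason over both. A condensed sketch built from the helpers in `src/` (the `analyse` wrapper and prompt are illustrative; error handling omitted):
```python
from src.chroma.vector_store import search_documents
from src.rag.embeddings import get_embedding
from src.rag.gemeni import GeminiClient


def analyse(prompt: str) -> str:
    # Step 4: embed the claim with the Ollama embedding model.
    query_embedding = get_embedding(prompt)
    # Step 5: pull the closest chunks from the configured vector store.
    hits = search_documents(query_embedding, num_results=15)
    context = "\n\n".join(hit["text"] for hit in hits)
    # Step 6: hand the claim plus the retrieved context to Gemini.
    return GeminiClient().ask(prompt, context)
```
The real handler also prefixes each retrieved chunk with its source metadata before sending the combined context to Gemini.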
### RAG (Retrieval-Augmented Generation)
- Supports CSV, PDF, TXT, and XLSX file ingestion
- Documents are chunked and batched for embedding
- Prevents duplicate ingestion of processed files
- Semantic search using cosine similarity
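Ingestion follows the same chunk, embed, insert pattern for every supported format. A minimal sketch using the repo's helpers (the PDF path and metadata values are illustrative, and `load_pdf` is assumed to return the extracted text):
```python
from src.chroma.vector_store import insert_documents
from src.rag.embeddings import get_embeddings_batch
from src.rag.ingest import chunk_text, load_pdf

# Assumption: load_pdf returns the document's plain text.
text = load_pdf("dataset/acme-2024-sustainability-report.pdf")
chunks = chunk_text(text, target_length=2000, overlap=100)
embeddings = get_embeddings_batch(chunks)
metadata = [
    {"source": "acme-2024-sustainability-report.pdf", "chunk_index": i, "total_chunks": len(chunks)}
    for i in range(len(chunks))
]
insert_documents(chunks, embeddings, metadata_list=metadata)
```
Duplicate ingestion is avoided by logging processed filenames in MongoDB (`is_file_processed` / `log_processed_file`), which the populate script consults unless run with `--force`.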

View File

@@ -12,7 +12,7 @@ from src.rag.ingest import process_file
from src.rag.store import ingest_documents
from src.mongo.metadata import is_file_processed, log_processed_file
def populate_from_dataset(dataset_dir, category=None):
def populate_from_dataset(dataset_dir, category=None, force=False):
    dataset_path = Path(dataset_dir)
    if not dataset_path.exists():
        print(f"Dataset directory not found: {dataset_dir}")
@@ -27,9 +27,9 @@ def populate_from_dataset(dataset_dir, category=None):
    for file_path in dataset_path.glob('*'):
        if file_path.is_file() and file_path.suffix.lower() in ['.csv', '.pdf', '.txt', '.xlsx']:
            if is_file_processed(file_path.name):
            if not force and is_file_processed(file_path.name):
                print(f"Skipping {file_path.name} (already processed)")
                continue
                continue
            print(f"Processing {file_path.name}...")
            try:
@@ -48,15 +48,23 @@ def populate_from_dataset(dataset_dir, category=None):
    print(f"\nFinished! Processed {files_processed} files. Total chunks ingested: {total_chunks}")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Populate vector database from dataset files")
    parser.add_argument("--category", "-c", type=str, help="Category to assign to ingested documents")
    parser.add_argument("--dir", "-d", type=str, default=None, help="Dataset directory path")
    args = parser.parse_args()
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Populate vector database from dataset files")
    parser.add_argument("--category", "-c", type=str, help="Category to assign to ingested documents")
    parser.add_argument("--dir", "-d", type=str, default=None, help="Dataset directory path")
    parser.add_argument("--force", "-f", action="store_true", help="Force re-processing of files even if marked as processed")
    args = parser.parse_args()
    if args.dir:
        dataset_dir = args.dir
    else:
        dataset_dir = os.path.join(os.path.dirname(__file__), '../dataset')
    # Check vector store mode
    use_atlas = os.environ.get("ATLAS_VECTORS", "false").lower() == "true"
    store_name = "MongoDB Atlas Vector Search" if use_atlas else "ChromaDB"
    print(f"--- Vector Store Mode: {store_name} ---")
    populate_from_dataset(dataset_dir, category=args.category)
    if args.dir:
        dataset_dir = args.dir
    else:
        dataset_dir = os.path.join(os.path.dirname(__file__), '../dataset')
    # Forward the --force flag so already-processed files can be re-ingested.
    populate_from_dataset(dataset_dir, category=args.category, force=args.force)

View File

@@ -0,0 +1,81 @@
import chromadb
import os
CHROMA_HOST = os.environ.get("CHROMA_HOST", "http://chroma.sirblob.co")
COLLECTION_NAME = "rag_documents"
_client = None
def get_chroma_client():
    global _client
    if _client is None:
        _client = chromadb.HttpClient(host=CHROMA_HOST)
    return _client
def get_collection(collection_name=COLLECTION_NAME):
    client = get_chroma_client()
    return client.get_or_create_collection(name=collection_name)
def insert_documents(texts, embeddings, collection_name=COLLECTION_NAME, metadata_list=None):
    collection = get_collection(collection_name)
    ids = [f"doc_{i}_{hash(text)}" for i, text in enumerate(texts)]
    if metadata_list:
        collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=texts,
            metadatas=metadata_list
        )
    else:
        collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=texts
        )
    return len(texts)
def search_documents(query_embedding, collection_name=COLLECTION_NAME, num_results=5, filter_metadata=None):
    collection = get_collection(collection_name)
    query_params = {
        "query_embeddings": [query_embedding],
        "n_results": num_results
    }
    if filter_metadata:
        query_params["where"] = filter_metadata
    results = collection.query(**query_params)
    output = []
    if results and results["documents"]:
        for i, doc in enumerate(results["documents"][0]):
            score = results["distances"][0][i] if "distances" in results else None
            meta = results["metadatas"][0][i] if "metadatas" in results else {}
            output.append({
                "text": doc,
                "score": score,
                "metadata": meta
            })
    return output
def delete_documents_by_source(source_file, collection_name=COLLECTION_NAME):
    collection = get_collection(collection_name)
    results = collection.get(where={"source": source_file})
    if results["ids"]:
        collection.delete(ids=results["ids"])
        return len(results["ids"])
    return 0
def get_all_metadatas(collection_name=COLLECTION_NAME, limit=None):
    collection = get_collection(collection_name)
    if limit:
        results = collection.get(include=["metadatas"], limit=limit)
    else:
        results = collection.get(include=["metadatas"])
    return results["metadatas"] if results and "metadatas" in results else []

View File

@@ -1,80 +1,35 @@
import chromadb
import os
from . import chroma_store
from ..mongo import vector_store as mongo_store
CHROMA_HOST ="http://chroma.sirblob.co"
COLLECTION_NAME ="rag_documents"
def _use_atlas():
return os.environ.get("ATLAS_VECTORS", "false").lower() == "true"
_client = None
def get_chroma_client():
# Only used by chroma-specific things if external
return chroma_store.get_chroma_client()
def get_chroma_client():
    global _client
    if _client is None:
        _client = chromadb.HttpClient(host=CHROMA_HOST)
    return _client
def get_collection(collection_name=chroma_store.COLLECTION_NAME):
if _use_atlas():
return mongo_store.get_collection(collection_name)
return chroma_store.get_collection(collection_name)
def get_collection(collection_name=COLLECTION_NAME):
    client = get_chroma_client()
    return client.get_or_create_collection(name=collection_name)
def insert_documents(texts, embeddings, collection_name=chroma_store.COLLECTION_NAME, metadata_list=None):
if _use_atlas():
return mongo_store.insert_documents(texts, embeddings, collection_name, metadata_list)
return chroma_store.insert_documents(texts, embeddings, collection_name, metadata_list)
def insert_documents(texts, embeddings, collection_name=COLLECTION_NAME, metadata_list=None):
    collection = get_collection(collection_name)
def search_documents(query_embedding, collection_name=chroma_store.COLLECTION_NAME, num_results=5, filter_metadata=None):
if _use_atlas():
return mongo_store.search_documents(query_embedding, collection_name, num_results, filter_metadata)
return chroma_store.search_documents(query_embedding, collection_name, num_results, filter_metadata)
ids =[f"doc_{i }_{hash (text )}"for i ,text in enumerate (texts )]
def delete_documents_by_source(source_file, collection_name=chroma_store.COLLECTION_NAME):
if _use_atlas():
return mongo_store.delete_documents_by_source(source_file, collection_name)
return chroma_store.delete_documents_by_source(source_file, collection_name)
    if metadata_list:
        collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=texts,
            metadatas=metadata_list
        )
    else:
        collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=texts
        )
    return len(texts)
def search_documents(query_embedding, collection_name=COLLECTION_NAME, num_results=5, filter_metadata=None):
    collection = get_collection(collection_name)
    query_params = {
        "query_embeddings": [query_embedding],
        "n_results": num_results
    }
    if filter_metadata:
        query_params["where"] = filter_metadata
    results = collection.query(**query_params)
    output = []
    if results and results["documents"]:
        for i, doc in enumerate(results["documents"][0]):
            score = results["distances"][0][i] if "distances" in results else None
            meta = results["metadatas"][0][i] if "metadatas" in results else {}
            output.append({
                "text": doc,
                "score": score,
                "metadata": meta
            })
    return output
def delete_documents_by_source(source_file, collection_name=COLLECTION_NAME):
    collection = get_collection(collection_name)
    results = collection.get(where={"source": source_file})
    if results["ids"]:
        collection.delete(ids=results["ids"])
        return len(results["ids"])
    return 0
def get_all_metadatas(collection_name=COLLECTION_NAME, limit=None):
    collection = get_collection(collection_name)
    if limit:
        results = collection.get(include=["metadatas"], limit=limit)
    else:
        results = collection.get(include=["metadatas"])
    return results["metadatas"] if results and "metadatas" in results else []
def get_all_metadatas(collection_name=chroma_store.COLLECTION_NAME, limit=None):
if _use_atlas():
return mongo_store.get_all_metadatas(collection_name, limit)
return chroma_store.get_all_metadatas(collection_name, limit)

View File

@@ -1,49 +1,142 @@
from .connection import get_mongo_client
import os
from .connection import get_mongo_client
def insert_rag_documents(documents, collection_name="rag_documents", db_name="vectors_db"):
    client = get_mongo_client()
    db = client.get_database(db_name)
    collection = db[collection_name]
DB_NAME = "ethix_vectors"
# Default collection name to match Chroma's default
DEFAULT_COLLECTION_NAME = "rag_documents"
    if documents:
        result = collection.insert_many(documents)
        return len(result.inserted_ids)
    return 0
def get_collection(collection_name=DEFAULT_COLLECTION_NAME):
client = get_mongo_client()
db = client[DB_NAME]
return db[collection_name]
def search_rag_documents(query_embedding, collection_name="rag_documents", db_name="vectors_db", num_results=5):
    client = get_mongo_client()
    db = client.get_database(db_name)
    collection = db[collection_name]
def insert_documents(texts, embeddings, collection_name=DEFAULT_COLLECTION_NAME, metadata_list=None):
collection = get_collection(collection_name)
docs = []
for i, text in enumerate(texts):
doc = {
"text": text,
"embedding": embeddings[i],
"metadata": metadata_list[i] if metadata_list else {}
}
# Flatten metadata for easier filtering if needed, or keep in 'metadata' subdoc
# Atlas Vector Search usually suggests keeping it cleaner, but 'metadata' field is fine
# provided index is on metadata.field
# Add source file to top level for easier deletion
if metadata_list and "source" in metadata_list[i]:
doc["source"] = metadata_list[i]["source"]
docs.append(doc)
if docs:
result = collection.insert_many(docs)
return len(result.inserted_ids)
return 0
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": num_results * 10,
                "limit": num_results
            }
        },
        {
            "$project": {
                "_id": 0,
                "text": 1,
                "score": {"$meta": "vectorSearchScore"}
            }
        }
def search_documents(query_embedding, collection_name=DEFAULT_COLLECTION_NAME, num_results=5, filter_metadata=None):
collection = get_collection(collection_name)
# Construct Atlas Search Aggregation
# Note: 'index' name depends on what user created. Default is often 'default' or 'vector_index'.
# We'll use 'vector_index' as a convention.
# Filter construction
pre_filter = None
if filter_metadata:
# Atlas Search filter syntax is different from simple match
# It requires MQL (Mongo Query Language) match or compound operators
# For simplicity in $vectorSearch, strict filters are passed in 'filter' field (if serverless/v2)
# or separate match stage.
# Modern Atlas Vector Search ($vectorSearch) supports 'filter' inside the operator
must_clauses = []
for k, v in filter_metadata.items():
# Assuming filter_metadata matches metadata fields inside 'metadata' object
# or we map specific known fields.
# In Chroma, filters are flat. In insert above, we put them in 'metadata'.
must_clauses.append({
"term": {
"path": f"metadata.{k}",
"value": v
}
})
if must_clauses:
if len(must_clauses) == 1:
pre_filter = must_clauses[0]
else:
pre_filter = {
"compound": {
"filter": must_clauses
}
}
pipeline = [
{
"$vectorSearch": {
"index": "vector_index",
"path": "embedding",
"queryVector": query_embedding,
"numCandidates": num_results * 10,
"limit": num_results
}
}
]
# Apply filter if exists
if pre_filter:
pipeline[0]["$vectorSearch"]["filter"] = pre_filter
    return list(collection.aggregate(pipeline))
pipeline.append({
"$project": {
"_id": 0,
"text": 1,
"metadata": 1,
"score": {"$meta": "vectorSearchScore"}
}
})
def is_file_processed(filename, log_collection="ingested_files", db_name="vectors_db"):
    client = get_mongo_client()
    db = client.get_database(db_name)
    collection = db[log_collection]
    return collection.find_one({"filename": filename}) is not None
try:
results = list(collection.aggregate(pipeline))
# Format to match Chroma output style
# Chroma: [{'text': ..., 'score': ..., 'metadata': ...}]
output = []
for doc in results:
output.append({
"text": doc.get("text", ""),
"score": doc.get("score", 0),
"metadata": doc.get("metadata", {})
})
return output
except Exception as e:
print(f"Atlas Vector Search Error: {e}")
# Fallback to empty if index doesn't exist etc
return []
def log_processed_file(filename, log_collection="ingested_files", db_name="vectors_db"):
    client = get_mongo_client()
    db = client.get_database(db_name)
    collection = db[log_collection]
    collection.insert_one({"filename": filename, "processed_at": 1})
def delete_documents_by_source(source_file, collection_name=DEFAULT_COLLECTION_NAME):
collection = get_collection(collection_name)
# We added 'source' to top level in insert_documents for exactly this efficiency
result = collection.delete_many({"source": source_file})
return result.deleted_count
def get_all_metadatas(collection_name=DEFAULT_COLLECTION_NAME, limit=None):
collection = get_collection(collection_name)
cursor = collection.find({}, {"metadata": 1, "source": 1, "_id": 0})
if limit:
cursor = cursor.limit(limit)
metadatas = []
for doc in cursor:
meta = doc.get("metadata", {})
# Ensure 'source' is in metadata if we pulled it out
if "source" in doc and "source" not in meta:
meta["source"] = doc["source"]
metadatas.append(meta)
return metadatas

View File

@@ -1,5 +1,6 @@
import base64
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Optional, Union
@@ -11,8 +12,8 @@ except ImportError:
    OLLAMA_AVAILABLE = False
    print("Ollama not installed. Run: pip install ollama")
DEFAULT_HOST = "https://ollama.sirblob.co"
DEFAULT_MODEL = "ministral-3:latest"
DEFAULT_HOST = os.environ.get("OLLAMA_HOST", "https://ollama.sirblob.co")
DEFAULT_MODEL = "ministral-3:latest"
DEFAULT_PROMPT ="""Analyze this image and identify ALL logos, brand names, and company names visible.

View File

@@ -1,8 +1,9 @@
import ollama
import os
client = ollama.Client(host="https://ollama.sirblob.co")
DEFAULT_MODEL = "nomic-embed-text:latest"
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "https://ollama.sirblob.co")
client = ollama.Client(host=OLLAMA_HOST)
DEFAULT_MODEL = "nomic-embed-text:latest"
def get_embedding(text, model=DEFAULT_MODEL):
    try:

View File

@@ -1,37 +1,69 @@
from flask import Blueprint, request, jsonify
from src.rag.gemeni import GeminiClient
from src.gemini import ask_gemini_with_rag
from flask import Blueprint, request, jsonify
from src.rag.gemeni import GeminiClient
from src.gemini import ask_gemini_with_rag
from src.chroma.vector_store import search_documents
from src.rag.embeddings import get_embedding
gemini_bp = Blueprint('gemini', __name__)
brain = None
gemini_bp = Blueprint('gemini', __name__)
brain = None
def get_brain():
    global brain
    if brain is None:
        brain = GeminiClient()
    return brain
def get_brain():
global brain
if brain is None:
brain = GeminiClient()
return brain
@gemini_bp.route('/ask', methods=['POST'])
def ask():
    data = request.json
    prompt = data.get("prompt")
    context = data.get("context", "")
@gemini_bp.route('/ask', methods=['POST'])
def ask():
data = request.json
prompt = data.get("prompt")
context = data.get("context", "")
    if not prompt:
        return jsonify({"error": "No prompt provided"}), 400
if not prompt:
return jsonify({"error": "No prompt provided"}), 400
    try:
        client = get_brain()
        response = client.ask(prompt, context)
        return jsonify({
            "status": "success",
            "reply": response
try:
# Step 1: Retrieve relevant context from ChromaDB (RAG)
print(f"Generating embedding for prompt: {prompt}")
query_embedding = get_embedding(prompt)
print("Searching ChromaDB for context...")
search_results = search_documents(query_embedding, num_results=15)
retrieved_context = ""
if search_results:
print(f"Found {len(search_results)} documents.")
retrieved_context = "RELEVANT INFORMATION FROM DATABASE:\n"
for res in search_results:
# Include metadata if useful, e.g. brand name or date
meta = res.get('metadata', {})
source_info = f"[Source: {meta.get('type', 'doc')} - {meta.get('product_name', 'Unknown')}]"
retrieved_context += f"{source_info}\n{res['text']}\n\n"
else:
print("No relevant documents found.")
# Step 2: Combine manual context (if any) with retrieved context
full_context = context
if retrieved_context:
if full_context:
full_context += "\n\n" + retrieved_context
else:
full_context = retrieved_context
# Step 3: Ask Gemini
client = get_brain()
response = client.ask(prompt, full_context)
return jsonify({
"status": "success",
"reply": response
})
    except Exception as e:
        return jsonify({
            "status": "error",
            "message": str(e)
        }), 500
except Exception as e:
print(f"Error in RAG flow: {e}")
return jsonify({
"status": "error",
"message": str(e)
}), 500
@gemini_bp.route('/rag', methods=['POST'])
def rag():

View File

@@ -306,7 +306,7 @@ This incident has been documented for future reference and to help inform sustai
        metadata_list=[metadata]
    )
    print(f"✓ Incident #{incident_id} saved to ChromaDB for AI chat context")
print (f"✓ Incident #{incident_id } saved to Vector Store for AI chat context")

View File

@@ -1,14 +1,220 @@
from flask import Blueprint, jsonify, request
from src.chroma.vector_store import get_all_metadatas, search_documents
from src.rag.embeddings import get_embedding
from src.chroma.vector_store import get_all_metadatas, search_documents, insert_documents
from src.rag.embeddings import get_embedding, get_embeddings_batch
from src.rag.ingest import load_pdf, chunk_text
import os
import base64
import re
from google import genai
reports_bp = Blueprint('reports', __name__)
# Validation prompt for checking if PDF is a legitimate environmental report
REPORT_VALIDATION_PROMPT = """You are an expert document classifier. Analyze the following text extracted from a PDF and determine if it is a legitimate corporate environmental/sustainability report.
A legitimate report should contain:
1. Company name and branding
2. Environmental metrics (emissions, energy usage, waste, water)
3. Sustainability goals or targets
4. Time periods/years covered
5. Professional report structure
Analyze this text and respond in JSON format:
{
"is_valid_report": true/false,
"confidence": "high"/"medium"/"low",
"company_name": "extracted company name or null",
"report_year": "extracted year or null",
"report_type": "sustainability/environmental/impact/ESG/unknown",
"sector": "Tech/Energy/Automotive/Aerospace/Retail/Manufacturing/Other",
"key_topics": ["list of main topics covered"],
"reasoning": "brief explanation of your assessment"
}
TEXT TO ANALYZE:
"""
def validate_report_with_ai(text_content):
"""Use Gemini to validate if the PDF is a legitimate environmental report."""
try:
# Take first 15000 chars for validation (enough to assess legitimacy)
sample_text = text_content[:15000] if len(text_content) > 15000 else text_content
client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
response = client.models.generate_content(
model="gemini-2.0-flash",
contents=f"{REPORT_VALIDATION_PROMPT}\n{sample_text}"
)
# Parse the JSON response
response_text = response.text
# Extract JSON from response
import json
json_match = re.search(r'\{[\s\S]*\}', response_text)
if json_match:
return json.loads(json_match.group())
return {"is_valid_report": False, "reasoning": "Could not parse AI response"}
except Exception as e:
print(f"AI validation error: {e}")
return {"is_valid_report": False, "reasoning": str(e)}
@reports_bp.route('/upload', methods=['POST'])
def upload_report():
"""Upload and verify a company environmental report PDF."""
try:
data = request.json
pdf_data = data.get('pdf_data')
company_name = data.get('company_name', '').strip()
if not pdf_data:
return jsonify({
'status': 'error',
'step': 'validation',
'message': 'No PDF data provided'
}), 400
# Step 1: Decode PDF
try:
# Remove data URL prefix if present
if ',' in pdf_data:
pdf_data = pdf_data.split(',')[1]
pdf_bytes = base64.b64decode(pdf_data)
except Exception as e:
return jsonify({
'status': 'error',
'step': 'decode',
'message': f'Failed to decode PDF: {str(e)}'
}), 400
# Step 2: Extract text from PDF
import io
from pypdf import PdfReader
try:
pdf_file = io.BytesIO(pdf_bytes)
reader = PdfReader(pdf_file)
all_text = ""
page_count = len(reader.pages)
for page in reader.pages:
text = page.extract_text()
if text:
all_text += text + "\n\n"
if len(all_text.strip()) < 500:
return jsonify({
'status': 'error',
'step': 'extraction',
'message': 'PDF contains insufficient text content. It may be image-based or corrupted.'
}), 400
except Exception as e:
return jsonify({
'status': 'error',
'step': 'extraction',
'message': f'Failed to extract text from PDF: {str(e)}'
}), 400
# Step 3: Validate with AI
validation = validate_report_with_ai(all_text)
if not validation.get('is_valid_report', False):
return jsonify({
'status': 'rejected',
'step': 'validation',
'message': 'This does not appear to be a legitimate environmental/sustainability report.',
'validation': validation
}), 400
# Step 4: Generate filename
detected_company = validation.get('company_name') or company_name or 'Unknown'
detected_year = validation.get('report_year') or '2024'
report_type = validation.get('report_type', 'sustainability')
sector = validation.get('sector', 'Other')
# Clean company name for filename
safe_company = re.sub(r'[^a-zA-Z0-9]', '-', detected_company).lower()
filename = f"{safe_company}-{detected_year}-{report_type}-report.pdf"
# Step 5: Save PDF to dataset folder
current_dir = os.path.dirname(os.path.abspath(__file__))
dataset_dir = os.path.join(current_dir, '..', '..', 'dataset')
os.makedirs(dataset_dir, exist_ok=True)
file_path = os.path.join(dataset_dir, filename)
# Check if file already exists
if os.path.exists(file_path):
# Add timestamp to make unique
import time
timestamp = int(time.time())
filename = f"{safe_company}-{detected_year}-{report_type}-report-{timestamp}.pdf"
file_path = os.path.join(dataset_dir, filename)
with open(file_path, 'wb') as f:
f.write(pdf_bytes)
# Step 6: Chunk and embed for RAG
chunks = chunk_text(all_text, target_length=2000, overlap=100)
if not chunks:
chunks = [all_text[:4000]]
# Get embeddings in batches
embeddings = get_embeddings_batch(chunks)
# Create metadata for each chunk
metadata_list = []
for i, chunk in enumerate(chunks):
metadata_list.append({
'source': filename,
'company_name': detected_company,
'year': detected_year,
'sector': sector,
'report_type': report_type,
'chunk_index': i,
'total_chunks': len(chunks),
'page_count': page_count,
'type': 'company_report'
})
# Insert into ChromaDB
insert_documents(chunks, embeddings, metadata_list=metadata_list)
return jsonify({
'status': 'success',
'message': 'Report verified and uploaded successfully',
'filename': filename,
'validation': validation,
'stats': {
'page_count': page_count,
'text_length': len(all_text),
'chunks_created': len(chunks),
'company_name': detected_company,
'year': detected_year,
'sector': sector,
'report_type': report_type
}
})
except Exception as e:
print(f"Upload error: {e}")
import traceback
traceback.print_exc()
return jsonify({
'status': 'error',
'step': 'unknown',
'message': str(e)
}), 500
@reports_bp.route('/', methods=['GET'])
def get_reports():
    try:
        metadatas = get_all_metadatas()
        unique_reports = {}
@@ -24,12 +230,6 @@ def get_reports ():
if filename not in unique_reports :
company_name ="Unknown"
year ="N/A"
sector ="Other"
@@ -106,15 +306,10 @@ def search_reports():
    try:
        import re
        query_embedding = get_embedding(query)
        results = search_documents(query_embedding, num_results=50)
        results = search_documents(query_embedding, num_results=50)
        query_lower = query.lower()
        query_lower = query.lower()
        def extract_company_info(filename):
            company_name = "Unknown"
@@ -224,10 +419,55 @@ def view_report_file(filename):
    import os
    from flask import send_from_directory
    current_dir = os.path.dirname(os.path.abspath(__file__))
    dataset_dir = os.path.join(current_dir, '..', '..', 'dataset')
    return send_from_directory(dataset_dir, filename)
@reports_bp.route('/stats', methods=['GET'])
def get_stats():
"""Get real statistics for the landing page"""
try:
# 1. Count Verified Reports (Company Reports)
metadatas = get_all_metadatas()
unique_reports = set()
unique_sectors = set()
for meta in metadatas:
filename = meta.get('source') or meta.get('filename')
# Check if it's a company report (not incident)
if filename and not filename.startswith('incident_') and meta.get('type') != 'incident_report':
unique_reports.add(filename)
# Count sectors
sector = meta.get('sector')
if sector and sector != 'Other':
unique_sectors.add(sector)
verified_reports_count = len(unique_reports)
active_sectors_count = len(unique_sectors)
# Fallback if no specific sectors found but we have reports
if active_sectors_count == 0 and verified_reports_count > 0:
active_sectors_count = 1
# 2. Count Products Scanned / Incidents
from src.mongo.connection import get_mongo_client
client = get_mongo_client()
db = client["ethix"]
# Count all incidents (users scanning products)
incidents_count = db["incidents"].count_documents({})
return jsonify({
"verified_reports": verified_reports_count,
"incidents_reported": incidents_count,
"active_sectors": active_sectors_count
})
except Exception as e:
print(f"Error fetching stats: {e}")
return jsonify({
"verified_reports": 0,
"incidents_reported": 0,
"active_sectors": 0
})

View File

@@ -0,0 +1,51 @@
import os
import sys
from datetime import datetime
# Add the backend directory to sys.path so we can import src modules
sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
from src.mongo.connection import get_mongo_client
from src.routes.incidents import save_to_chromadb
def sync_incidents():
print("Starting sync of incidents from MongoDB to ChromaDB...")
client = get_mongo_client()
db = client["ethix"]
collection = db["incidents"]
# Find all confirmed greenwashing incidents
cursor = collection.find({"is_greenwashing": True})
count = 0
synced = 0
errors = 0
for incident in cursor:
count += 1
incident_id = str(incident["_id"])
product_name = incident.get("product_name", "Unknown")
print(f"Processing Incident #{incident_id} ({product_name})...")
try:
# save_to_chromadb expects the incident dict plus the id string;
# convert _id to a string so downstream serialization is safe.
incident["_id"] = incident_id
save_to_chromadb(incident, incident_id)
synced += 1
print(f" -> Successfully synced.")
except Exception as e:
errors += 1
print(f" -> FAILED to sync: {e}")
print(f"\nSync Complete.")
print(f"Total processed: {count}")
print(f"Successfully synced: {synced}")
print(f"Errors: {errors}")
if __name__ == "__main__":
sync_incidents()