Docker Update and Fixes

2026-01-25 17:36:15 +00:00
parent b7b718d4ca
commit 295be1ed8e
21 changed files with 886 additions and 289 deletions

@@ -12,3 +12,4 @@ flask-cors
 ollama
 chromadb-client
 pymongo
+google-genai
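The only dependency added here is google-genai, the SDK that the GeminiClient changes below build on. A minimal smoke test for the rebuilt image might look like the sketch below; the API-key environment variable is an assumption, and the model name is taken from a later hunk in this commit.

# Minimal sketch, assuming GEMINI_API_KEY (or GOOGLE_API_KEY) is set in the container.
from google import genai

client = genai.Client()  # picks the API key up from the environment
response = client.models.generate_content(
    model="gemini-2.0-flash-exp",  # the model this commit switches to
    contents="ping",
)
print(response.text)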

@@ -18,7 +18,7 @@ class GeminiClient:
     def ask(self, prompt, context=""):
         try:
             if context:
-                full_message = f"Use this information to answer: {context}\n\nQuestion: {prompt}"
+                full_message = f"Background Context:\n{context}\n\nUser Question: {prompt}"
             else:
                 full_message = prompt
@@ -26,7 +26,7 @@ class GeminiClient:
                 model=self.model_name,
                 contents=full_message,
                 config={
-                    'system_instruction': 'You are a concise sustainability assistant. Your responses must be a single short paragraph, maximum 6 sentences long. Do not use bullet points or multiple sections.'
+                    'system_instruction': 'You are Ethix, an expert sustainability assistant. You have access to a database including Georgetown University sustainability reports. SEARCH THE PROVIDED CONTEXT CAREFULLY. If the context contains ANY information about Georgetown University or the user\'s query, matches, or partial matches, YOU MUST USE IT to answer. Ignore irrelevant parts of the context. If no context matches, provide general expert advice. Keep responses concise (max 6 sentences).'
                 }
             )
             return response.text
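For reference, a hypothetical call to the updated ask() with RAG context wired in. GeminiClient's constructor and the context string format are assumed from the other hunks in this commit; the source name is illustrative only.

# Hypothetical usage sketch; not part of this commit.
gemini = GeminiClient()
answer = gemini.ask(
    "Does Georgetown University compost food waste?",
    context="[Source: doc - georgetown_report]\nGeorgetown composts in all dining halls...",
)
print(answer)  # a short paragraph, per the system instruction above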

@@ -27,17 +27,53 @@ def ask():
     print(f"Generating embedding for prompt: {prompt}")
     query_embedding = get_embedding(prompt)
-    print("Searching ChromaDB for context...")
-    search_results = search_documents(query_embedding, num_results=15)
+    print("Searching Vector Database for context...")
+    search_results = search_documents(query_embedding, num_results=50)
+
+    # Special handling for Georgetown University queries to ensure those docs are included
+    # even if generic corporate reports outrank them in vector search.
+    if "georgetown" in prompt.lower():
+        try:
+            from src.mongo.connection import get_mongo_client
+            client = get_mongo_client()
+            # Use hardcoded DB name to match vector_store.py
+            db = client["ethix_vectors"]
+            collection = db["rag_documents"]
+            # Fetch docs with Georgetown or MAP_INFO in the filename/source
+            gt_docs = list(collection.find({"source": {"$regex": "Georgetown|MAP_INFO", "$options": "i"}}).limit(30))
+            if gt_docs:
+                print(f"Direct Match: Found {len(gt_docs)} Georgetown specific documents.")
+                for doc in gt_docs:
+                    # Normalize to match search_results format
+                    search_results.append({
+                        "text": doc.get("text", ""),
+                        "metadata": doc.get("metadata", {"source": doc.get("source", "Georgetown File")}),
+                        "score": 1.0  # High priority
+                    })
+        except Exception as e:
+            print(f"Error checking Georgetown docs: {e}")
 
     retrieved_context = ""
     if search_results:
-        print(f"Found {len(search_results)} documents.")
         retrieved_context = "RELEVANT INFORMATION FROM DATABASE:\n"
+        print(f"Found {len(search_results)} documents (total).")
+        print("Sources found:")
+        seen_sources = set()
         for res in search_results:
             # Include metadata if useful, e.g. brand name or date
             meta = res.get('metadata', {})
-            source_info = f"[Source: {meta.get('type', 'doc')} - {meta.get('product_name', 'Unknown')}]"
+            source = meta.get('source', 'unknown')
+            # Deduplication of printing and adding context if exact text overlap?
+            # For now just append. LLM can handle duplication.
+            if source not in seen_sources:
+                print(f"  - {source}")
+                seen_sources.add(source)
+            source_info = f"[Source: {meta.get('type', 'doc')} - {source}]"
             retrieved_context += f"{source_info}\n{res['text']}\n\n"
     else:
         print("No relevant documents found.")

@@ -86,6 +86,7 @@ class GreenwashingAnalysis(BaseModel):
     recommendations: str = Field(description="What consumers should know about this case")
     key_claims: List[str] = Field(description="List of specific environmental claims made by the company")
     red_flags: List[str] = Field(description="List of red flags or concerning practices identified")
+    alternatives: List[str] = Field(description="List of sustainable alternatives or better choices")

 class LogoDetection(BaseModel):
@@ -124,8 +125,11 @@ Based on this information, determine if this is a valid case of greenwashing. Co
 2. Are their eco-friendly claims vague or unsubstantiated?
 3. Is there a disconnect between their marketing and actual practices?
 4. Are they using green imagery or terms without substance?
+5. Suggest better, clearer, or more sustainable alternatives if applicable.
+
+If the provided context includes university-specific information (e.g., Georgetown University), incorporate it into your recommendations where relevant (e.g., disposal instructions, local alternatives).
+
 Provide your analysis in the structured format requested."""
 def analyze_with_gemini(product_name: str, user_description: str, detected_brand: str,
@@ -147,7 +151,7 @@ image_description: str, context: str) -> GreenwashingAnalysis:
     response = client.models.generate_content(
-        model="gemini-3-pro-preview",
+        model="gemini-2.0-flash-exp",
         contents=prompt,
         config={
             "response_mime_type": "application/json",
@@ -311,169 +315,240 @@ This incident has been documented for future reference and to help inform sustai
 @incidents_bp.route('/submit', methods=['POST'])
 def submit_incident():
     """
     Submit a greenwashing incident report
     Expects JSON with:
     - product_name: Name of the product/company
-    - description: User's description of the misleading claim
+    - description: User's description
     - report_type: 'product' or 'company'
-    - image: Base64 encoded image (for product reports)
-    - pdf_data: Base64 encoded PDF (for company reports)
+    - image: Base64 encoded image
+    - user_id: ID of the user submitting
+    - is_public: Boolean, whether to make it public
     """
     data = request.json
     if not data:
         return jsonify({"error": "No data provided"}), 400
     product_name = data.get('product_name', '').strip()
     user_description = data.get('description', '').strip()
     report_type = data.get('report_type', 'product')
     image_base64 = data.get('image')
+    user_id = data.get('user_id', 'anonymous')
+    is_public = data.get('is_public', False)
     if not product_name:
         return jsonify({"error": "Product name is required"}), 400
+    # Description isn't strictly required if image provides context, but good enforcement
     if not user_description:
-        return jsonify({"error": "Description is required"}), 400
+        user_description = "No description provided."
     try:
         detected_brand = "Unknown"
         image_description = "No image provided"
         environmental_claims = []
         compressed_image_base64 = None
         if report_type == 'product' and image_base64:
             try:
                 if ',' in image_base64:
                     image_base64 = image_base64.split(',')[1]
                 image_bytes = base64.b64decode(image_base64)
-                print("Compressing image with OpenCV...")
+                # Compress for storage
                 compressed_image_base64 = compress_image(image_bytes, max_width=600, quality=75)
+                # Analyze image with Ollama
                 image_analysis = analyze_image_with_ollama(image_bytes)
                 if image_analysis.logos_detected:
                     detected_brand = image_analysis.logos_detected[0].brand
                 image_description = image_analysis.description
                 environmental_claims = image_analysis.environmental_claims
             except Exception as e:
                 print(f"Image processing error: {e}")
+        # RAG Search context
         search_query = f"{product_name} {detected_brand} environmental claims sustainability greenwashing"
         query_embedding = get_embedding(search_query)
-        search_results = search_documents(query_embedding, num_results=5)
+        search_results = search_documents(query_embedding, num_results=20)
         context = ""
         for res in search_results:
             context += f"--- Document ---\n{res['text'][:500]}\n\n"
         if not context:
             context = "No prior information found about this company in our database."
         if environmental_claims:
             context += "\n--- Claims visible in submitted image ---\n"
             context += "\n".join(f"- {claim}" for claim in environmental_claims)
+        # Main Analysis
         analysis = analyze_with_gemini(
             product_name=product_name,
             user_description=user_description,
             detected_brand=detected_brand,
             image_description=image_description,
             context=context
         )
         analysis_dict = analysis.model_dump()
         incident_data = {
             "product_name": product_name,
             "user_description": user_description,
             "detected_brand": detected_brand,
             "image_description": image_description,
             "environmental_claims": environmental_claims,
             "analysis": analysis_dict,
             "is_greenwashing": analysis.is_greenwashing,
             "created_at": datetime.utcnow().isoformat(),
             "status": "confirmed" if analysis.is_greenwashing else "dismissed",
-            "report_type": report_type
+            "report_type": report_type,
+            "user_id": user_id,
+            "is_public": is_public
         }
         if compressed_image_base64:
             incident_data["image_base64"] = compressed_image_base64
-        incident_id = None
-        if analysis.is_greenwashing:
-            incident_id = save_to_mongodb(incident_data)
-            save_to_chromadb(incident_data, incident_id)
+        # Save to MongoDB (All scans)
+        incident_id = save_to_mongodb(incident_data)
+        # Save to Vector Store ONLY if Greenwashing AND Public
+        if analysis.is_greenwashing and is_public:
+            save_to_chromadb(incident_data, incident_id)
         return jsonify({
             "status": "success",
             "is_greenwashing": analysis.is_greenwashing,
             "incident_id": incident_id,
             "analysis": analysis_dict,
             "detected_brand": detected_brand,
             "environmental_claims": environmental_claims
         })
     except Exception as e:
         import traceback
         traceback.print_exc()
         return jsonify({
             "status": "error",
             "message": str(e)
         }), 500
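An example request against the reworked endpoint; the port and blueprint prefix are assumptions, and the payload simply mirrors the docstring above:

# Example client call; URL prefix and port are assumptions, not part of this diff.
import base64
import requests

with open("product.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode()

resp = requests.post(
    "http://localhost:5000/incidents/submit",
    json={
        "product_name": "EcoClean Detergent",
        "description": "Label claims 100% green with no certification shown.",
        "report_type": "product",
        "image": image_b64,
        "user_id": "demo-user",
        "is_public": True,
    },
)
print(resp.json()["is_greenwashing"])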
 @incidents_bp.route('/list', methods=['GET'])
 def list_incidents():
-    """Get all confirmed greenwashing incidents"""
+    """Get all PUBLIC confirmed greenwashing incidents"""
     try:
         client = get_mongo_client()
         db = client["ethix"]
         collection = db["incidents"]
+        # Filter: Public AND Greenwashing
+        query = {
+            "is_greenwashing": True,
+            "is_public": True
+            # Note: Legacy records without 'is_public' might be hidden.
+            # For migration, we might want to treat missing as True or update DB.
+            # Assuming strictly filtered for now.
+        }
         incidents = list(collection.find(
-            {"is_greenwashing": True},
+            query,
             {"_id": 1, "product_name": 1, "detected_brand": 1,
              "user_description": 1, "analysis": 1, "created_at": 1,
              "image_base64": 1, "report_type": 1}
         ).sort("created_at", -1).limit(50))
         for inc in incidents:
             inc["_id"] = str(inc["_id"])
         return jsonify(incidents)
     except Exception as e:
         return jsonify({"error": str(e)}), 500
+
+@incidents_bp.route('/history', methods=['GET'])
+def get_user_history():
+    """Get scan history for a specific user"""
+    user_id = request.args.get('user_id')
+    if not user_id:
+        return jsonify({"error": "user_id required"}), 400
+    try:
+        client = get_mongo_client()
+        db = client["ethix"]
+        collection = db["incidents"]
+        query = {"user_id": user_id}
+        incidents = list(collection.find(
+            query,
+            {"_id": 1, "product_name": 1, "detected_brand": 1,
+             "analysis": 1, "created_at": 1, "image_base64": 1, "is_public": 1}
+        ).sort("created_at", -1).limit(50))
+        for inc in incidents:
+            inc["_id"] = str(inc["_id"])
+        return jsonify(incidents)
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+
+@incidents_bp.route('/<incident_id>/visibility', methods=['PUT'])
+def update_visibility(incident_id):
+    """Update incident visibility (public/private)"""
+    try:
+        from bson import ObjectId
+        data = request.json
+        is_public = data.get('is_public')
+        if is_public is None:
+            return jsonify({"error": "is_public required"}), 400
+        client = get_mongo_client()
+        db = client["ethix"]
+        collection = db["incidents"]
+        # 1. Update MongoDB
+        result = collection.update_one(
+            {"_id": ObjectId(incident_id)},
+            {"$set": {"is_public": is_public}}
+        )
+        if result.matched_count == 0:
+            return jsonify({"error": "Incident not found"}), 404
+        # 2. Sync with ChromaDB (Vector Store)
+        # We need the incident data to insert/delete
+        incident = collection.find_one({"_id": ObjectId(incident_id)})
+        if is_public and incident.get("is_greenwashing", False):
+            # If public and greenwashing -> Add to Chroma
+            save_to_chromadb(incident, incident_id)
+        else:
+            # If private OR not greenwashing -> Remove from Chroma
+            # We need delete functionality. delete_documents_by_source uses 'source' metadata.
+            from src.chroma.vector_store import delete_documents_by_source
+            delete_documents_by_source(f"incident_{incident_id}")
+        return jsonify({"status": "success", "is_public": is_public})
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        return jsonify({"error": str(e)}), 500
 @incidents_bp.route('/<incident_id>', methods=['GET'])
 def get_incident(incident_id):
     """Get a specific incident by ID"""
     try:
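update_visibility imports delete_documents_by_source, which is not shown in this commit. A sketch of what it presumably does, assuming a Chroma HTTP client and a collection whose chunks carry the 'source' metadata field the route's comment describes; the service host, port, and collection name are all assumptions:

# Sketch of the missing helper; names below are assumptions, not from this diff.
import chromadb

chroma = chromadb.HttpClient(host="chromadb", port=8000)
collection = chroma.get_or_create_collection("rag_documents")

def delete_documents_by_source(source: str) -> None:
    # Removes every chunk whose 'source' metadata matches, e.g. "incident_<id>".
    collection.delete(where={"source": source})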