Docker Update and Fixes

2026-01-25 17:36:15 +00:00
parent b7b718d4ca
commit 295be1ed8e
21 changed files with 886 additions and 289 deletions

@@ -12,3 +12,4 @@ flask-cors
 ollama
 chromadb-client
 pymongo
+google-genai
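The only dependency added here is google-genai, the SDK that the GeminiClient changes below build on. A minimal smoke test for the rebuilt image might look like the sketch below; the API-key environment variable is an assumption, and the model name is taken from a later hunk in this commit.

# Minimal sketch, assuming GEMINI_API_KEY (or GOOGLE_API_KEY) is set in the container.
from google import genai

client = genai.Client()  # picks the API key up from the environment
response = client.models.generate_content(
    model="gemini-2.0-flash-exp",  # the model this commit switches to
    contents="ping",
)
print(response.text)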

@@ -18,7 +18,7 @@ class GeminiClient:
     def ask(self, prompt, context=""):
         try:
             if context:
-                full_message = f"Use this information to answer: {context}\n\nQuestion: {prompt}"
+                full_message = f"Background Context:\n{context}\n\nUser Question: {prompt}"
             else:
                 full_message = prompt
@@ -26,7 +26,7 @@ class GeminiClient:
                 model=self.model_name,
                 contents=full_message,
                 config={
-                    'system_instruction': 'You are a concise sustainability assistant. Your responses must be a single short paragraph, maximum 6 sentences long. Do not use bullet points or multiple sections.'
+                    'system_instruction': 'You are Ethix, an expert sustainability assistant. You have access to a database including Georgetown University sustainability reports. SEARCH THE PROVIDED CONTEXT CAREFULLY. If the context contains ANY information about Georgetown University or the user\'s query, matches, or partial matches, YOU MUST USE IT to answer. Ignore irrelevant parts of the context. If no context matches, provide general expert advice. Keep responses concise (max 6 sentences).'
                 }
             )
             return response.text
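For reference, a hypothetical call to the updated ask() with RAG context wired in. GeminiClient's constructor and the context string format are assumed from the other hunks in this commit; the source name is illustrative only.

# Hypothetical usage sketch; not part of this commit.
gemini = GeminiClient()
answer = gemini.ask(
    "Does Georgetown University compost food waste?",
    context="[Source: doc - georgetown_report]\nGeorgetown composts in all dining halls...",
)
print(answer)  # a short paragraph, per the system instruction above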

@@ -27,17 +27,53 @@ def ask():
     print(f"Generating embedding for prompt: {prompt}")
     query_embedding = get_embedding(prompt)
-    print("Searching ChromaDB for context...")
-    search_results = search_documents(query_embedding, num_results=15)
+    print("Searching Vector Database for context...")
+    search_results = search_documents(query_embedding, num_results=50)
+
+    # Special handling for Georgetown University queries to ensure those docs are included
+    # even if generic corporate reports outrank them in vector search.
+    if "georgetown" in prompt.lower():
+        try:
+            from src.mongo.connection import get_mongo_client
+            client = get_mongo_client()
+            # Use hardcoded DB name to match vector_store.py
+            db = client["ethix_vectors"]
+            collection = db["rag_documents"]
+            # Fetch docs with Georgetown or MAP_INFO in the filename/source
+            gt_docs = list(collection.find({"source": {"$regex": "Georgetown|MAP_INFO", "$options": "i"}}).limit(30))
+            if gt_docs:
+                print(f"Direct Match: Found {len(gt_docs)} Georgetown specific documents.")
+                for doc in gt_docs:
+                    # Normalize to match search_results format
+                    search_results.append({
+                        "text": doc.get("text", ""),
+                        "metadata": doc.get("metadata", {"source": doc.get("source", "Georgetown File")}),
+                        "score": 1.0  # High priority
+                    })
+        except Exception as e:
+            print(f"Error checking Georgetown docs: {e}")
 
     retrieved_context = ""
     if search_results:
-        print(f"Found {len(search_results)} documents.")
         retrieved_context = "RELEVANT INFORMATION FROM DATABASE:\n"
+        print(f"Found {len(search_results)} documents (total).")
+        print("Sources found:")
+        seen_sources = set()
         for res in search_results:
             # Include metadata if useful, e.g. brand name or date
             meta = res.get('metadata', {})
-            source_info = f"[Source: {meta.get('type', 'doc')} - {meta.get('product_name', 'Unknown')}]"
+            source = meta.get('source', 'unknown')
+            # Deduplication of printing and adding context if exact text overlap?
+            # For now just append. LLM can handle duplication.
+            if source not in seen_sources:
+                print(f"  - {source}")
+                seen_sources.add(source)
+            source_info = f"[Source: {meta.get('type', 'doc')} - {source}]"
             retrieved_context += f"{source_info}\n{res['text']}\n\n"
     else:
         print("No relevant documents found.")

@@ -86,6 +86,7 @@ class GreenwashingAnalysis(BaseModel):
     recommendations: str = Field(description="What consumers should know about this case")
     key_claims: List[str] = Field(description="List of specific environmental claims made by the company")
     red_flags: List[str] = Field(description="List of red flags or concerning practices identified")
+    alternatives: List[str] = Field(description="List of sustainable alternatives or better choices")

 class LogoDetection(BaseModel):
@@ -124,8 +125,11 @@ Based on this information, determine if this is a valid case of greenwashing. Co
 2. Are their eco-friendly claims vague or unsubstantiated?
 3. Is there a disconnect between their marketing and actual practices?
 4. Are they using green imagery or terms without substance?
+5. Suggest better, clearer, or more sustainable alternatives if applicable.
+
+If the provided context includes university-specific information (e.g., Georgetown University), incorporate it into your recommendations where relevant (e.g., disposal instructions, local alternatives).
+
 Provide your analysis in the structured format requested."""
 def analyze_with_gemini(product_name: str, user_description: str, detected_brand: str,
@@ -147,7 +151,7 @@ image_description: str, context: str) -> GreenwashingAnalysis:
     response = client.models.generate_content(
-        model="gemini-3-pro-preview",
+        model="gemini-2.0-flash-exp",
         contents=prompt,
         config={
             "response_mime_type": "application/json",
@@ -311,169 +315,240 @@ This incident has been documented for future reference and to help inform sustai
 @incidents_bp.route('/submit', methods=['POST'])
 def submit_incident():
     """
     Submit a greenwashing incident report
     Expects JSON with:
     - product_name: Name of the product/company
-    - description: User's description of the misleading claim
+    - description: User's description
     - report_type: 'product' or 'company'
-    - image: Base64 encoded image (for product reports)
-    - pdf_data: Base64 encoded PDF (for company reports)
+    - image: Base64 encoded image
+    - user_id: ID of the user submitting
+    - is_public: Boolean, whether to make it public
     """
     data = request.json
     if not data:
         return jsonify({"error": "No data provided"}), 400
     product_name = data.get('product_name', '').strip()
     user_description = data.get('description', '').strip()
     report_type = data.get('report_type', 'product')
     image_base64 = data.get('image')
+    user_id = data.get('user_id', 'anonymous')
+    is_public = data.get('is_public', False)
     if not product_name:
         return jsonify({"error": "Product name is required"}), 400
+    # Description isn't strictly required if image provides context, but good enforcement
     if not user_description:
-        return jsonify({"error": "Description is required"}), 400
+        user_description = "No description provided."
     try:
         detected_brand = "Unknown"
         image_description = "No image provided"
         environmental_claims = []
         compressed_image_base64 = None
         if report_type == 'product' and image_base64:
             try:
                 if ',' in image_base64:
                     image_base64 = image_base64.split(',')[1]
                 image_bytes = base64.b64decode(image_base64)
-                print("Compressing image with OpenCV...")
+                # Compress for storage
                 compressed_image_base64 = compress_image(image_bytes, max_width=600, quality=75)
+                # Analyze image with Ollama
                 image_analysis = analyze_image_with_ollama(image_bytes)
                 if image_analysis.logos_detected:
                     detected_brand = image_analysis.logos_detected[0].brand
                 image_description = image_analysis.description
                 environmental_claims = image_analysis.environmental_claims
             except Exception as e:
                 print(f"Image processing error: {e}")
+        # RAG Search context
         search_query = f"{product_name} {detected_brand} environmental claims sustainability greenwashing"
         query_embedding = get_embedding(search_query)
-        search_results = search_documents(query_embedding, num_results=5)
+        search_results = search_documents(query_embedding, num_results=20)
         context = ""
         for res in search_results:
             context += f"--- Document ---\n{res['text'][:500]}\n\n"
         if not context:
             context = "No prior information found about this company in our database."
         if environmental_claims:
             context += "\n--- Claims visible in submitted image ---\n"
             context += "\n".join(f"- {claim}" for claim in environmental_claims)
+        # Main Analysis
         analysis = analyze_with_gemini(
             product_name=product_name,
             user_description=user_description,
             detected_brand=detected_brand,
             image_description=image_description,
             context=context
         )
         analysis_dict = analysis.model_dump()
         incident_data = {
             "product_name": product_name,
             "user_description": user_description,
             "detected_brand": detected_brand,
             "image_description": image_description,
             "environmental_claims": environmental_claims,
             "analysis": analysis_dict,
             "is_greenwashing": analysis.is_greenwashing,
             "created_at": datetime.utcnow().isoformat(),
             "status": "confirmed" if analysis.is_greenwashing else "dismissed",
-            "report_type": report_type
+            "report_type": report_type,
+            "user_id": user_id,
+            "is_public": is_public
         }
         if compressed_image_base64:
             incident_data["image_base64"] = compressed_image_base64
-        incident_id = None
-        if analysis.is_greenwashing:
-            incident_id = save_to_mongodb(incident_data)
-            save_to_chromadb(incident_data, incident_id)
+        # Save to MongoDB (All scans)
+        incident_id = save_to_mongodb(incident_data)
+        # Save to Vector Store ONLY if Greenwashing AND Public
+        if analysis.is_greenwashing and is_public:
+            save_to_chromadb(incident_data, incident_id)
         return jsonify({
             "status": "success",
             "is_greenwashing": analysis.is_greenwashing,
             "incident_id": incident_id,
             "analysis": analysis_dict,
             "detected_brand": detected_brand,
             "environmental_claims": environmental_claims
         })
     except Exception as e:
         import traceback
         traceback.print_exc()
         return jsonify({
             "status": "error",
             "message": str(e)
         }), 500
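An example request against the reworked endpoint; the port and blueprint prefix are assumptions, and the payload simply mirrors the docstring above:

# Example client call; URL prefix and port are assumptions, not part of this diff.
import base64
import requests

with open("product.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode()

resp = requests.post(
    "http://localhost:5000/incidents/submit",
    json={
        "product_name": "EcoClean Detergent",
        "description": "Label claims 100% green with no certification shown.",
        "report_type": "product",
        "image": image_b64,
        "user_id": "demo-user",
        "is_public": True,
    },
)
print(resp.json()["is_greenwashing"])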
 @incidents_bp.route('/list', methods=['GET'])
 def list_incidents():
-    """Get all confirmed greenwashing incidents"""
+    """Get all PUBLIC confirmed greenwashing incidents"""
     try:
         client = get_mongo_client()
         db = client["ethix"]
         collection = db["incidents"]
+        # Filter: Public AND Greenwashing
+        query = {
+            "is_greenwashing": True,
+            "is_public": True
+            # Note: Legacy records without 'is_public' might be hidden.
+            # For migration, we might want to treat missing as True or update DB.
+            # Assuming strictly filtered for now.
+        }
         incidents = list(collection.find(
-            {"is_greenwashing": True},
+            query,
             {"_id": 1, "product_name": 1, "detected_brand": 1,
              "user_description": 1, "analysis": 1, "created_at": 1,
              "image_base64": 1, "report_type": 1}
         ).sort("created_at", -1).limit(50))
         for inc in incidents:
             inc["_id"] = str(inc["_id"])
         return jsonify(incidents)
     except Exception as e:
         return jsonify({"error": str(e)}), 500
+
+@incidents_bp.route('/history', methods=['GET'])
+def get_user_history():
+    """Get scan history for a specific user"""
+    user_id = request.args.get('user_id')
+    if not user_id:
+        return jsonify({"error": "user_id required"}), 400
+    try:
+        client = get_mongo_client()
+        db = client["ethix"]
+        collection = db["incidents"]
+        query = {"user_id": user_id}
+        incidents = list(collection.find(
+            query,
+            {"_id": 1, "product_name": 1, "detected_brand": 1,
+             "analysis": 1, "created_at": 1, "image_base64": 1, "is_public": 1}
+        ).sort("created_at", -1).limit(50))
+        for inc in incidents:
+            inc["_id"] = str(inc["_id"])
+        return jsonify(incidents)
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+
+@incidents_bp.route('/<incident_id>/visibility', methods=['PUT'])
+def update_visibility(incident_id):
+    """Update incident visibility (public/private)"""
+    try:
+        from bson import ObjectId
+        data = request.json
+        is_public = data.get('is_public')
+        if is_public is None:
+            return jsonify({"error": "is_public required"}), 400
+        client = get_mongo_client()
+        db = client["ethix"]
+        collection = db["incidents"]
+        # 1. Update MongoDB
+        result = collection.update_one(
+            {"_id": ObjectId(incident_id)},
+            {"$set": {"is_public": is_public}}
+        )
+        if result.matched_count == 0:
+            return jsonify({"error": "Incident not found"}), 404
+        # 2. Sync with ChromaDB (Vector Store)
+        # We need the incident data to insert/delete
+        incident = collection.find_one({"_id": ObjectId(incident_id)})
+        if is_public and incident.get("is_greenwashing", False):
+            # If public and greenwashing -> Add to Chroma
+            save_to_chromadb(incident, incident_id)
+        else:
+            # If private OR not greenwashing -> Remove from Chroma
+            # We need delete functionality. delete_documents_by_source uses 'source' metadata.
+            from src.chroma.vector_store import delete_documents_by_source
+            delete_documents_by_source(f"incident_{incident_id}")
+        return jsonify({"status": "success", "is_public": is_public})
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        return jsonify({"error": str(e)}), 500
 @incidents_bp.route('/<incident_id>', methods=['GET'])
 def get_incident(incident_id):
     """Get a specific incident by ID"""
     try:
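update_visibility imports delete_documents_by_source, which is not shown in this commit. A sketch of what it presumably does, assuming a Chroma HTTP client and a collection whose chunks carry the 'source' metadata field the route's comment describes; the service host, port, and collection name are all assumptions:

# Sketch of the missing helper; names below are assumptions, not from this diff.
import chromadb

chroma = chromadb.HttpClient(host="chromadb", port=8000)
collection = chroma.get_or_create_collection("rag_documents")

def delete_documents_by_source(source: str) -> None:
    # Removes every chunk whose 'source' metadata matches, e.g. "incident_<id>".
    collection.delete(where={"source": source})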