AI Prompt and RAG improvements

2026-02-04 03:34:34 -05:00 · 2026-01-26 12:34:00 +00:00
parent 001ff97518
commit f941d8342b
3 changed files with 66 additions and 32 deletions
--- a/backend/src/chroma/chroma_store.py
+++ b/backend/src/chroma/chroma_store.py
@@ -1,5 +1,6 @@
 import chromadb 
 import os
 import hashlib
 CHROMA_HOST = os.environ.get("CHROMA_HOST", "http://chroma.sirblob.co")
 COLLECTION_NAME = "rag_documents"
@@ -16,26 +17,38 @@ def get_collection (collection_name =COLLECTION_NAME ):
    client =get_chroma_client ()
    return client .get_or_create_collection (name =collection_name )
def insert_documents(texts, embeddings, collection_name=COLLECTION_NAME, metadata_list=None, batch_size=2000):
    """Add documents and their embeddings to a Chroma collection in batches.

    Args:
        texts: Sequence of document strings to store.
        embeddings: Sequence of embedding vectors, one per entry in ``texts``.
        collection_name: Name of the collection, resolved through
            ``get_collection``.
        metadata_list: Optional sequence of metadata entries, one per entry
            in ``texts``. ``None`` or an empty sequence means the documents
            are added without metadata.
        batch_size: Maximum number of documents sent per ``collection.add``
            call. Defaults to 2000.
            NOTE(review): presumably chosen to stay under a per-request
            limit of the Chroma server -- confirm against the deployment.

    Returns:
        The number of entries in ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if ``embeddings``
            or a non-empty ``metadata_list`` differ in length from ``texts``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total_count = len(texts)

    # Fail fast on misaligned inputs: slicing past the end of a shorter
    # sequence yields empty batches instead of an error, which would let later
    # batches be stored without their metadata.
    if len(embeddings) != total_count:
        raise ValueError(
            f"embeddings has {len(embeddings)} entries but texts has {total_count}"
        )
    if metadata_list and len(metadata_list) != total_count:
        raise ValueError(
            f"metadata_list has {len(metadata_list)} entries but texts has {total_count}"
        )

    collection = get_collection(collection_name)

    # IDs combine the position within this call with a truncated SHA-256 of
    # the text. Unlike the built-in hash(), whose value for str is randomized
    # per process, this is reproducible across runs.
    ids = [
        f"doc_{i}_{hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]}"
        for i, text in enumerate(texts)
    ]

    for start in range(0, total_count, batch_size):
        end = min(start + batch_size, total_count)
        batch = {
            "ids": ids[start:end],
            "embeddings": embeddings[start:end],
            "documents": texts[start:end],
        }
        # Only pass ``metadatas`` when the caller supplied metadata, so the
        # no-metadata call is the same as before.
        if metadata_list:
            batch["metadatas"] = metadata_list[start:end]
        collection.add(**batch)

    return total_count
 def search_documents (query_embedding ,collection_name =COLLECTION_NAME ,num_results =5 ,filter_metadata =None ):
    collection =get_collection (collection_name )
--- a/backend/src/gemini/init.py
+++ b/backend/src/gemini/init.py
@@ -3,22 +3,34 @@ from google import genai
 from src .chroma .vector_store import search_documents 
 from src .rag .embeddings import get_embedding 
-GREENWASHING_ANALYSIS_PROMPT ="""
+GREENWASHING_ANALYSIS_PROMPT = """
-You are an expert Environmental, Social, and Governance (ESG) Analyst specialized in detecting 'Greenwashing'.
+You are a Forensics ESG Analyst specialized in detecting Greenwashing.
-Your task is to analyze the provided context from a company's data reports and determine if they are engaging in greenwashing.
+Your objective is to audit the provided company report excerpts and expose any misleading sustainability claims.
-Greenwashing is defined as making misleading or unsubstantiated claims about the environmental benefits of a product, service, or company practice.
+### DEFINITION
 Greenwashing: The practice of making unsubstantiated or misleading claims about the environmental benefits of a product, service, or practice.
-Please evaluate the following:
+### ANALYSIS FRAMEWORK
-1. Vague Claims: Are they using broad terms like 'eco-friendly' without specific details?
+Analyze the provided text against these criteria:
-2. Lack of Proof: Are claims backed by data, third-party certifications, or specific metrics?
+1.  **Vague Terminology**: Usage of buzzwords ("eco-friendly", "green", "sustainable") without quantifiable definitions.
-3. Hidden Trade-offs: Do they highlight one green act while ignoring a much larger environmental harm?
+2.  **No Proof**: Claims lacking specific metrics (e.g., "reduced emissions" vs "reduced CO2 by 15% compared to 2020 baseline").
-4. Symbolic Actions: Are they focusing on minor changes while their core business remains highly polluting?
+3.  **Hidden Trade-offs**: Emphasizing a minor eco-feature while ignoring major negative impacts (e.g., recyclable packaging on a toxic product).
 4.  **Irrelevance**: Citing standard compliance (legal requirements) as proactive sustainability achievements.
-Based on the context provided, give a final verdict:
+### OUTPUT FORMAT
- VERDICT: [Clear/Suspect/High Risk of Greenwashing]
+Provide a structured analysis:
- REASONING: [Explain your findings clearly]
+
- EVIDENCE: [Quote specific parts of the context if possible]
+**Verdict**: [LOW RISK / MODERATE RISK / HIGH RISK / CONFIRMED GREENWASHING]
 **Key Findings**:
 *   [Finding 1]: [Explanation]
 *   [Finding 2]: [Explanation]
 **Evidence**:
 *   "[Quote from text]" -> *Critique of why this is problematic or good.*
 **Conclusion**:
 A brief 1-2 sentence summary of the brand's honesty regarding this topic.
 """
 def ask (prompt ):
--- a/backend/src/rag/gemeni.py
+++ b/backend/src/rag/gemeni.py
@@ -13,7 +13,7 @@ class GeminiClient :
            raise ValueError ("No GOOGLE_API_KEY found in .env file!")
        self .client =genai .Client (api_key =self .api_key )
-        self .model_name ="gemini-2.0-flash"
+        self .model_name ="gemini-3-pro-preview"
    def ask (self ,prompt ,context =""):
        try :
@@ -26,7 +26,16 @@ class GeminiClient :
            model =self .model_name ,
            contents =full_message ,
            config ={
-            'system_instruction':'You are Ethix, an expert sustainability assistant. You have access to a database including Georgetown University sustainability reports. SEARCH THE PROVIDED CONTEXT CAREFULLY. If the context contains ANY information about Georgetown University or the user\'s query, matches, or partial matches, YOU MUST USE IT to answer. Ignore irrelevant parts of the context. If no context matches, provide general expert advice. Keep responses concise (max 6 sentences).'
+            'system_instruction': (
                "You are Ethix, an expert sustainability AI assistant. "
                "Your mission is to analyze sustainability reports and provide data-driven insights. "
                "You have access to a database of ESG reports (including Georgetown University). "
                "CRITICAL INSTRUCTIONS: "
                "1. PRIORITIZE CONTEXT: If the provided background context contains relevant data, YOU MUST USE IT and explicitly cite it. "
                "2. DETECT GREENWASHING: Be vigilant for vague buzzwords ('eco-friendly', 'green') lacking specific metrics. Flag them. "
                "3. ACCURACY: Do not hallucinate data. If the context is empty or irrelevant to the query, state 'I don't have specific data on this in my current reports' before offering general expert knowledge. "
                "4. FORMATTING: Use Markdown. Keep answers professional, concise, and structured."
            )
            }
            )
            return response .text