AI Prompt and RAG improvements

2026-06-22 19:16:24 -04:00 · 2026-01-26 12:34:00 +00:00
parent 001ff97518
commit f941d8342b
3 changed files with 66 additions and 32 deletions
@@ -1,5 +1,6 @@
 import chromadb 
 import os
+import hashlib

 CHROMA_HOST = os.environ.get("CHROMA_HOST", "http://chroma.sirblob.co")
 COLLECTION_NAME = "rag_documents"
@@ -16,26 +17,38 @@ def get_collection (collection_name =COLLECTION_NAME ):
    client =get_chroma_client ()
    return client .get_or_create_collection (name =collection_name )

-def insert_documents (texts ,embeddings ,collection_name =COLLECTION_NAME ,metadata_list =None ):
-    collection =get_collection (collection_name )
+def insert_documents(texts, embeddings, collection_name=COLLECTION_NAME, metadata_list=None):
+    collection = get_collection(collection_name)

-    ids =[f"doc_{i }_{hash (text )}"for i ,text in enumerate (texts )]
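+    # Generate IDs that stay stable across runs: first 16 hex chars of the text's SHA-256 digest (built-in hash() of a str is salted per process)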
+    ids = [f"doc_{i}_{hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]}" for i, text in enumerate(texts)]

-    if metadata_list :
-        collection .add (
-        ids =ids ,
-        embeddings =embeddings ,
-        documents =texts ,
-        metadatas =metadata_list 
-        )
-    else :
-        collection .add (
-        ids =ids ,
-        embeddings =embeddings ,
-        documents =texts 
-        )
+    total_count = len(texts)
+    batch_size = 2000
    
-    return len (texts )
+    for i in range(0, total_count, batch_size):
+        end = min(i + batch_size, total_count)
+        
+        batch_ids = ids[i:end]
+        batch_texts = texts[i:end]
+        batch_embeddings = embeddings[i:end]
+        batch_metadatas = metadata_list[i:end] if metadata_list else None
+        
+        if batch_metadatas:
+            collection.add(
+                ids=batch_ids,
+                embeddings=batch_embeddings,
+                documents=batch_texts,
+                metadatas=batch_metadatas
+            )
+        else:
+            collection.add(
+                ids=batch_ids,
+                embeddings=batch_embeddings,
+                documents=batch_texts
+            )
+
+    return total_count

 def search_documents (query_embedding ,collection_name =COLLECTION_NAME ,num_results =5 ,filter_metadata =None ):
    collection =get_collection (collection_name )
@@ -3,22 +3,34 @@ from google import genai
 from src .chroma .vector_store import search_documents 
 from src .rag .embeddings import get_embedding 

-GREENWASHING_ANALYSIS_PROMPT ="""
-You are an expert Environmental, Social, and Governance (ESG) Analyst specialized in detecting 'Greenwashing'.
-Your task is to analyze the provided context from a company's data reports and determine if they are engaging in greenwashing.
+GREENWASHING_ANALYSIS_PROMPT = """
+You are a Forensics ESG Analyst specialized in detecting Greenwashing.
+Your objective is to audit the provided company report excerpts and expose any misleading sustainability claims.

-Greenwashing is defined as making misleading or unsubstantiated claims about the environmental benefits of a product, service, or company practice.
+### DEFINITION
+Greenwashing: The practice of making unsubstantiated or misleading claims about the environmental benefits of a product, service, or practice.

-Please evaluate the following:
-1. Vague Claims: Are they using broad terms like 'eco-friendly' without specific details?
-2. Lack of Proof: Are claims backed by data, third-party certifications, or specific metrics?
-3. Hidden Trade-offs: Do they highlight one green act while ignoring a much larger environmental harm?
-4. Symbolic Actions: Are they focusing on minor changes while their core business remains highly polluting?
+### ANALYSIS FRAMEWORK
+Analyze the provided text against these criteria:
+1.  **Vague Terminology**: Usage of buzzwords ("eco-friendly", "green", "sustainable") without quantifiable definitions.
+2.  **No Proof**: Claims lacking specific metrics (e.g., "reduced emissions" vs "reduced CO2 by 15% compared to 2020 baseline").
+3.  **Hidden Trade-offs**: Emphasizing a minor eco-feature while ignoring major negative impacts (e.g., recyclable packaging on a toxic product).
+4.  **Irrelevance**: Citing standard compliance (legal requirements) as proactive sustainability achievements.

-Based on the context provided, give a final verdict:
- VERDICT: [Clear/Suspect/High Risk of Greenwashing]
- REASONING: [Explain your findings clearly]
- EVIDENCE: [Quote specific parts of the context if possible]
+### OUTPUT FORMAT
+Provide a structured analysis:
+
+**Verdict**: [LOW RISK / MODERATE RISK / HIGH RISK / CONFIRMED GREENWASHING]
+
+**Key Findings**:
+*   [Finding 1]: [Explanation]
+*   [Finding 2]: [Explanation]
+
+**Evidence**:
+*   "[Quote from text]" -> *Critique of why this is problematic or good.*
+
+**Conclusion**:
+A brief 1-2 sentence summary of the brand's honesty regarding this topic.
 """

 def ask (prompt ):
@@ -13,7 +13,7 @@ class GeminiClient :
            raise ValueError ("No GOOGLE_API_KEY found in .env file!")

        self .client =genai .Client (api_key =self .api_key )
-        self .model_name ="gemini-2.0-flash"
+        self .model_name ="gemini-3-pro-preview"

    def ask (self ,prompt ,context =""):
        try :
@@ -26,7 +26,16 @@ class GeminiClient :
            model =self .model_name ,
            contents =full_message ,
            config ={
-            'system_instruction':'You are Ethix, an expert sustainability assistant. You have access to a database including Georgetown University sustainability reports. SEARCH THE PROVIDED CONTEXT CAREFULLY. If the context contains ANY information about Georgetown University or the user\'s query, matches, or partial matches, YOU MUST USE IT to answer. Ignore irrelevant parts of the context. If no context matches, provide general expert advice. Keep responses concise (max 6 sentences).'
+            'system_instruction': (
+                "You are Ethix, an expert sustainability AI assistant. "
+                "Your mission is to analyze sustainability reports and provide data-driven insights. "
+                "You have access to a database of ESG reports (including Georgetown University). "
+                "CRITICAL INSTRUCTIONS: "
+                "1. PRIORITIZE CONTEXT: If the provided background context contains relevant data, YOU MUST USE IT and explicitly cite it. "
+                "2. DETECT GREENWASHING: Be vigilant for vague buzzwords ('eco-friendly', 'green') lacking specific metrics. Flag them. "
+                "3. ACCURACY: Do not hallucinate data. If the context is empty or irrelevant to the query, state 'I don't have specific data on this in my current reports' before offering general expert knowledge. "
+                "4. FORMATTING: Use Markdown. Keep answers professional, concise, and structured."
+            )
            }
            )
            return response .text