AI Prompt and RAG improvements

This commit is contained in:
2026-01-26 12:34:00 +00:00
parent 001ff97518
commit f941d8342b
3 changed files with 66 additions and 32 deletions

View File

@@ -1,5 +1,6 @@
import chromadb import chromadb
import os import os
import hashlib
CHROMA_HOST = os.environ.get("CHROMA_HOST", "http://chroma.sirblob.co") CHROMA_HOST = os.environ.get("CHROMA_HOST", "http://chroma.sirblob.co")
COLLECTION_NAME = "rag_documents" COLLECTION_NAME = "rag_documents"
@@ -19,23 +20,35 @@ def get_collection (collection_name =COLLECTION_NAME ):
def insert_documents(texts, embeddings, collection_name=COLLECTION_NAME, metadata_list=None): def insert_documents(texts, embeddings, collection_name=COLLECTION_NAME, metadata_list=None):
collection = get_collection(collection_name) collection = get_collection(collection_name)
ids =[f"doc_{i }_{hash (text )}"for i ,text in enumerate (texts )] # Generate stable IDs using SHA256
ids = [f"doc_{i}_{hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]}" for i, text in enumerate(texts)]
if metadata_list : total_count = len(texts)
batch_size = 2000
for i in range(0, total_count, batch_size):
end = min(i + batch_size, total_count)
batch_ids = ids[i:end]
batch_texts = texts[i:end]
batch_embeddings = embeddings[i:end]
batch_metadatas = metadata_list[i:end] if metadata_list else None
if batch_metadatas:
collection.add( collection.add(
ids =ids , ids=batch_ids,
embeddings =embeddings , embeddings=batch_embeddings,
documents =texts , documents=batch_texts,
metadatas =metadata_list metadatas=batch_metadatas
) )
else: else:
collection.add( collection.add(
ids =ids , ids=batch_ids,
embeddings =embeddings , embeddings=batch_embeddings,
documents =texts documents=batch_texts
) )
return len (texts ) return total_count
def search_documents (query_embedding ,collection_name =COLLECTION_NAME ,num_results =5 ,filter_metadata =None ): def search_documents (query_embedding ,collection_name =COLLECTION_NAME ,num_results =5 ,filter_metadata =None ):
collection =get_collection (collection_name ) collection =get_collection (collection_name )

View File

@@ -4,21 +4,33 @@ from src .chroma .vector_store import search_documents
from src .rag .embeddings import get_embedding from src .rag .embeddings import get_embedding
GREENWASHING_ANALYSIS_PROMPT = """ GREENWASHING_ANALYSIS_PROMPT = """
You are an expert Environmental, Social, and Governance (ESG) Analyst specialized in detecting 'Greenwashing'. You are a Forensics ESG Analyst specialized in detecting Greenwashing.
Your task is to analyze the provided context from a company's data reports and determine if they are engaging in greenwashing. Your objective is to audit the provided company report excerpts and expose any misleading sustainability claims.
Greenwashing is defined as making misleading or unsubstantiated claims about the environmental benefits of a product, service, or company practice. ### DEFINITION
Greenwashing: The practice of making unsubstantiated or misleading claims about the environmental benefits of a product, service, or practice.
Please evaluate the following: ### ANALYSIS FRAMEWORK
1. Vague Claims: Are they using broad terms like 'eco-friendly' without specific details? Analyze the provided text against these criteria:
2. Lack of Proof: Are claims backed by data, third-party certifications, or specific metrics? 1. **Vague Terminology**: Usage of buzzwords ("eco-friendly", "green", "sustainable") without quantifiable definitions.
3. Hidden Trade-offs: Do they highlight one green act while ignoring a much larger environmental harm? 2. **No Proof**: Claims lacking specific metrics (e.g., "reduced emissions" vs "reduced CO2 by 15% compared to 2020 baseline").
4. Symbolic Actions: Are they focusing on minor changes while their core business remains highly polluting? 3. **Hidden Trade-offs**: Emphasizing a minor eco-feature while ignoring major negative impacts (e.g., recyclable packaging on a toxic product).
4. **Irrelevance**: Citing standard compliance (legal requirements) as proactive sustainability achievements.
Based on the context provided, give a final verdict: ### OUTPUT FORMAT
- VERDICT: [Clear/Suspect/High Risk of Greenwashing] Provide a structured analysis:
- REASONING: [Explain your findings clearly]
- EVIDENCE: [Quote specific parts of the context if possible] **Verdict**: [LOW RISK / MODERATE RISK / HIGH RISK / CONFIRMED GREENWASHING]
**Key Findings**:
* [Finding 1]: [Explanation]
* [Finding 2]: [Explanation]
**Evidence**:
* "[Quote from text]" -> *Critique of why this is problematic or good.*
**Conclusion**:
A brief 1-2 sentence summary of the brand's honesty regarding this topic.
""" """
def ask (prompt ): def ask (prompt ):

View File

@@ -13,7 +13,7 @@ class GeminiClient :
raise ValueError ("No GOOGLE_API_KEY found in .env file!") raise ValueError ("No GOOGLE_API_KEY found in .env file!")
self .client =genai .Client (api_key =self .api_key ) self .client =genai .Client (api_key =self .api_key )
self .model_name ="gemini-2.0-flash" self .model_name ="gemini-3-pro-preview"
def ask (self ,prompt ,context =""): def ask (self ,prompt ,context =""):
try : try :
@@ -26,7 +26,16 @@ class GeminiClient :
model =self .model_name , model =self .model_name ,
contents =full_message , contents =full_message ,
config ={ config ={
'system_instruction':'You are Ethix, an expert sustainability assistant. You have access to a database including Georgetown University sustainability reports. SEARCH THE PROVIDED CONTEXT CAREFULLY. If the context contains ANY information about Georgetown University or the user\'s query, matches, or partial matches, YOU MUST USE IT to answer. Ignore irrelevant parts of the context. If no context matches, provide general expert advice. Keep responses concise (max 6 sentences).' 'system_instruction': (
"You are Ethix, an expert sustainability AI assistant. "
"Your mission is to analyze sustainability reports and provide data-driven insights. "
"You have access to a database of ESG reports (including Georgetown University). "
"CRITICAL INSTRUCTIONS: "
"1. PRIORITIZE CONTEXT: If the provided background context contains relevant data, YOU MUST USE IT and explicitly cite it. "
"2. DETECT GREENWASHING: Be vigilant for vague buzzwords ('eco-friendly', 'green') lacking specific metrics. Flag them. "
"3. ACCURACY: Do not hallucinate data. If the context is empty or irrelevant to the query, state 'I don't have specific data on this in my current reports' before offering general expert knowledge. "
"4. FORMATTING: Use Markdown. Keep answers professional, concise, and structured."
)
} }
) )
return response .text return response .text