mirror of
https://github.com/SirBlobby/Hoya26.git
synced 2026-02-04 03:34:34 -05:00
AI Prompt and RAG improvements
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
import chromadb
|
import chromadb
|
||||||
import os
|
import os
|
||||||
|
import hashlib
|
||||||
|
|
||||||
CHROMA_HOST = os.environ.get("CHROMA_HOST", "http://chroma.sirblob.co")
|
CHROMA_HOST = os.environ.get("CHROMA_HOST", "http://chroma.sirblob.co")
|
||||||
COLLECTION_NAME = "rag_documents"
|
COLLECTION_NAME = "rag_documents"
|
||||||
@@ -16,26 +17,38 @@ def get_collection (collection_name =COLLECTION_NAME ):
|
|||||||
client =get_chroma_client ()
|
client =get_chroma_client ()
|
||||||
return client .get_or_create_collection (name =collection_name )
|
return client .get_or_create_collection (name =collection_name )
|
||||||
|
|
||||||
def insert_documents(texts, embeddings, collection_name=COLLECTION_NAME, metadata_list=None, batch_size=2000):
    """Add documents and their embeddings to a Chroma collection in batches.

    Args:
        texts: Sequence of document strings to store.
        embeddings: Sequence of embedding vectors, one per entry in ``texts``.
        collection_name: Name of the target collection, resolved through
            ``get_collection``.
        metadata_list: Optional sequence of metadata entries, one per
            document. A falsy value (``None`` or empty) means no metadata
            is sent with any batch.
        batch_size: Maximum number of documents sent per ``collection.add``
            call. Defaults to 2000.

    Returns:
        The number of documents submitted, i.e. ``len(texts)``.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if ``embeddings``
            or a non-empty ``metadata_list`` differ in length from ``texts``.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    total_count = len(texts)

    # Validate up front: a mismatch would otherwise only surface part-way
    # through the loop, after earlier batches had already been written, and
    # a short metadata_list would silently drop metadata for later batches.
    if len(embeddings) != total_count:
        raise ValueError(
            f"embeddings has {len(embeddings)} entries but texts has {total_count}"
        )
    if metadata_list and len(metadata_list) != total_count:
        raise ValueError(
            f"metadata_list has {len(metadata_list)} entries but texts has {total_count}"
        )

    collection = get_collection(collection_name)

    # IDs combine the document's position with a truncated SHA-256 of its
    # content, so they are reproducible across processes (unlike built-in
    # hash(), which is salted per interpreter run for strings).
    # NOTE(review): because the position is part of the ID, the same text at
    # a different index gets a different ID. The format is kept unchanged so
    # previously stored IDs still match.
    ids = [
        f"doc_{index}_{hashlib.sha256(text.encode('utf-8')).hexdigest()[:16]}"
        for index, text in enumerate(texts)
    ]

    # Presumably batching keeps each request within the server's per-call
    # limits - TODO confirm the appropriate ceiling for the deployed Chroma.
    for start in range(0, total_count, batch_size):
        end = start + batch_size  # slicing clamps to the sequence length
        batch = {
            "ids": ids[start:end],
            "embeddings": embeddings[start:end],
            "documents": texts[start:end],
        }
        if metadata_list:
            batch["metadatas"] = metadata_list[start:end]
        collection.add(**batch)

    return total_count
|
||||||
|
|
||||||
def search_documents (query_embedding ,collection_name =COLLECTION_NAME ,num_results =5 ,filter_metadata =None ):
|
def search_documents (query_embedding ,collection_name =COLLECTION_NAME ,num_results =5 ,filter_metadata =None ):
|
||||||
collection =get_collection (collection_name )
|
collection =get_collection (collection_name )
|
||||||
|
|||||||
@@ -3,22 +3,34 @@ from google import genai
|
|||||||
from src .chroma .vector_store import search_documents
|
from src .chroma .vector_store import search_documents
|
||||||
from src .rag .embeddings import get_embedding
|
from src .rag .embeddings import get_embedding
|
||||||
|
|
||||||
GREENWASHING_ANALYSIS_PROMPT ="""
|
GREENWASHING_ANALYSIS_PROMPT = """
|
||||||
You are an expert Environmental, Social, and Governance (ESG) Analyst specialized in detecting 'Greenwashing'.
|
You are a Forensics ESG Analyst specialized in detecting Greenwashing.
|
||||||
Your task is to analyze the provided context from a company's data reports and determine if they are engaging in greenwashing.
|
Your objective is to audit the provided company report excerpts and expose any misleading sustainability claims.
|
||||||
|
|
||||||
Greenwashing is defined as making misleading or unsubstantiated claims about the environmental benefits of a product, service, or company practice.
|
### DEFINITION
|
||||||
|
Greenwashing: The practice of making unsubstantiated or misleading claims about the environmental benefits of a product, service, or practice.
|
||||||
|
|
||||||
Please evaluate the following:
|
### ANALYSIS FRAMEWORK
|
||||||
1. Vague Claims: Are they using broad terms like 'eco-friendly' without specific details?
|
Analyze the provided text against these criteria:
|
||||||
2. Lack of Proof: Are claims backed by data, third-party certifications, or specific metrics?
|
1. **Vague Terminology**: Usage of buzzwords ("eco-friendly", "green", "sustainable") without quantifiable definitions.
|
||||||
3. Hidden Trade-offs: Do they highlight one green act while ignoring a much larger environmental harm?
|
2. **No Proof**: Claims lacking specific metrics (e.g., "reduced emissions" vs "reduced CO2 by 15% compared to 2020 baseline").
|
||||||
4. Symbolic Actions: Are they focusing on minor changes while their core business remains highly polluting?
|
3. **Hidden Trade-offs**: Emphasizing a minor eco-feature while ignoring major negative impacts (e.g., recyclable packaging on a toxic product).
|
||||||
|
4. **Irrelevance**: Citing standard compliance (legal requirements) as proactive sustainability achievements.
|
||||||
|
|
||||||
Based on the context provided, give a final verdict:
|
### OUTPUT FORMAT
|
||||||
- VERDICT: [Clear/Suspect/High Risk of Greenwashing]
|
Provide a structured analysis:
|
||||||
- REASONING: [Explain your findings clearly]
|
|
||||||
- EVIDENCE: [Quote specific parts of the context if possible]
|
**Verdict**: [LOW RISK / MODERATE RISK / HIGH RISK / CONFIRMED GREENWASHING]
|
||||||
|
|
||||||
|
**Key Findings**:
|
||||||
|
* [Finding 1]: [Explanation]
|
||||||
|
* [Finding 2]: [Explanation]
|
||||||
|
|
||||||
|
**Evidence**:
|
||||||
|
* "[Quote from text]" -> *Critique of why this is problematic or good.*
|
||||||
|
|
||||||
|
**Conclusion**:
|
||||||
|
A brief 1-2 sentence summary of the brand's honesty regarding this topic.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def ask (prompt ):
|
def ask (prompt ):
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ class GeminiClient :
|
|||||||
raise ValueError ("No GOOGLE_API_KEY found in .env file!")
|
raise ValueError ("No GOOGLE_API_KEY found in .env file!")
|
||||||
|
|
||||||
self .client =genai .Client (api_key =self .api_key )
|
self .client =genai .Client (api_key =self .api_key )
|
||||||
self .model_name ="gemini-2.0-flash"
|
self .model_name ="gemini-3-pro-preview"
|
||||||
|
|
||||||
def ask (self ,prompt ,context =""):
|
def ask (self ,prompt ,context =""):
|
||||||
try :
|
try :
|
||||||
@@ -26,7 +26,16 @@ class GeminiClient :
|
|||||||
model =self .model_name ,
|
model =self .model_name ,
|
||||||
contents =full_message ,
|
contents =full_message ,
|
||||||
config ={
|
config ={
|
||||||
'system_instruction':'You are Ethix, an expert sustainability assistant. You have access to a database including Georgetown University sustainability reports. SEARCH THE PROVIDED CONTEXT CAREFULLY. If the context contains ANY information about Georgetown University or the user\'s query, matches, or partial matches, YOU MUST USE IT to answer. Ignore irrelevant parts of the context. If no context matches, provide general expert advice. Keep responses concise (max 6 sentences).'
|
'system_instruction': (
|
||||||
|
"You are Ethix, an expert sustainability AI assistant. "
|
||||||
|
"Your mission is to analyze sustainability reports and provide data-driven insights. "
|
||||||
|
"You have access to a database of ESG reports (including Georgetown University). "
|
||||||
|
"CRITICAL INSTRUCTIONS: "
|
||||||
|
"1. PRIORITIZE CONTEXT: If the provided background context contains relevant data, YOU MUST USE IT and explicitly cite it. "
|
||||||
|
"2. DETECT GREENWASHING: Be vigilant for vague buzzwords ('eco-friendly', 'green') lacking specific metrics. Flag them. "
|
||||||
|
"3. ACCURACY: Do not hallucinate data. If the context is empty or irrelevant to the query, state 'I don't have specific data on this in my current reports' before offering general expert knowledge. "
|
||||||
|
"4. FORMATTING: Use Markdown. Keep answers professional, concise, and structured."
|
||||||
|
)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
return response .text
|
return response .text
|
||||||
|
|||||||
Reference in New Issue
Block a user