Reports Update

This commit is contained in:
2026-01-25 00:16:30 +00:00
parent 87df89fb32
commit d37d925150
11 changed files with 1153 additions and 254 deletions

View File

@@ -4,4 +4,4 @@ from src import create_app
app = create_app()
if __name__ == "__main__":
app.run(debug=True, port=5000)
app.run(debug=True, port=5000, host="0.0.0.0")

View File

@@ -11,5 +11,7 @@ def create_app():
app.register_blueprint(main_bp)
app.register_blueprint(rag_bp, url_prefix='/api/rag')
app.register_blueprint(gemini_bp, url_prefix='/api/gemini')
from .routes.reports import reports_bp
app.register_blueprint(reports_bp, url_prefix='/api/reports')
return app

View File

@@ -53,9 +53,11 @@ def search_documents(query_embedding, collection_name=COLLECTION_NAME, num_resul
if results and results["documents"]:
for i, doc in enumerate(results["documents"][0]):
score = results["distances"][0][i] if "distances" in results else None
meta = results["metadatas"][0][i] if "metadatas" in results else {}
output.append({
"text": doc,
"score": score
"score": score,
"metadata": meta
})
return output
@@ -67,3 +69,12 @@ def delete_documents_by_source(source_file, collection_name=COLLECTION_NAME):
collection.delete(ids=results["ids"])
return len(results["ids"])
return 0
def get_all_metadatas(collection_name=COLLECTION_NAME, limit=None):
    """Return the metadata dicts stored in a Chroma collection.

    Requests metadatas only (no documents or embeddings) so the call stays
    lightweight even for large collections.  A truthy ``limit`` caps the
    number of records fetched; otherwise every record's metadata is returned.
    Returns an empty list when the collection yields nothing.
    """
    collection = get_collection(collection_name)
    # Build the call arguments once instead of duplicating the .get() call.
    query_kwargs = {"include": ["metadatas"]}
    if limit:
        query_kwargs["limit"] = limit
    results = collection.get(**query_kwargs)
    if not results:
        return []
    return results.get("metadatas", [])

View File

@@ -3,9 +3,15 @@ import os
def generate_content(prompt, model_name="gemini-2.0-flash-exp"):
api_key = os.environ.get("GOOGLE_API_KEY")
client = genai.Client(api_key=api_key)
response = client.models.generate_content(
model=model_name,
contents=prompt,
)
return response.text
if not api_key:
return "Error: GOOGLE_API_KEY not found."
try:
client = genai.Client(api_key=api_key)
response = client.models.generate_content(
model=model_name,
contents=prompt,
)
return response.text
except Exception as e:
return f"Error interacting with Gemini API: {str(e)}"

View File

@@ -24,7 +24,10 @@ class GeminiClient:
response = self.client.models.generate_content(
model=self.model_name,
contents=full_message
contents=full_message,
config={
'system_instruction': 'You are a concise sustainability assistant. Your responses must be a single short paragraph, maximum 6 sentences long. Do not use bullet points or multiple sections.'
}
)
return response.text

View File

@@ -0,0 +1,228 @@
from flask import Blueprint, jsonify, request
from src.chroma.vector_store import get_all_metadatas, search_documents
from src.rag.embeddings import get_embedding
reports_bp = Blueprint('reports', __name__)
@reports_bp.route('/', methods=['GET'])
def get_reports():
    """List every unique report known to the vector store.

    Fetches metadata-only records from Chroma, deduplicates them by source
    filename, and infers company / year / sector from the filename with
    simple heuristics (e.g. "2020-tesla-impact-report.pdf",
    "ghgp_data_2021.xlsx").  Returns a JSON array of report descriptors,
    or a 500 with an ``{"error": ...}`` payload on failure.
    """
    # Hoisted out of the per-record loop (the original re-ran `import re`
    # on every metadata entry).
    import re

    # Known company substrings -> (display name, sector).  Order matters:
    # the first match wins, mirroring the original if/elif chain.
    known_companies = [
        ('tesla', 'Tesla', 'Automotive'),
        ('google', 'Google', 'Tech'),
        ('apple', 'Apple', 'Tech'),
        ('microsoft', 'Microsoft', 'Tech'),
        ('amazon', 'Amazon', 'Tech'),
        ('boeing', 'Boeing', 'Aerospace'),
        ('ghgp', 'GHGP Data', 'Data'),
        ('salesforce', 'Salesforce', 'Tech'),
    ]

    def _extract_info(filename):
        # Infer (company, year, sector) from a report filename.
        company_name, year, sector = "Unknown", "N/A", "Other"
        lower_name = filename.lower()
        year_match = re.search(r'20\d{2}', lower_name)
        if year_match:
            year = year_match.group(0)
        for needle, name, sec in known_companies:
            if needle in lower_name:
                return name, year, sec
        # HP needs narrower checks so "hp" inside another word does not
        # match; `startswith` added for consistency with the /search
        # endpoint's version of this heuristic.
        if 'hp ' in lower_name or 'hp-' in lower_name or lower_name.startswith('hp'):
            return "HP", year, "Tech"
        # Fallback: first filename token, skipping a leading year like
        # "2020-..." in favour of the next token.
        parts = re.split(r'[-_.]', filename)
        if parts:
            company_name = parts[0].capitalize()
            if company_name.isdigit():
                company_name = parts[1].capitalize() if len(parts) > 1 else "Unknown"
        return company_name, year, sector

    try:
        # Metadata-only fetch keeps this manageable even for ~60k chunks.
        metadatas = get_all_metadatas()
        unique_reports = {}
        for meta in metadatas:
            filename = meta.get('source') or meta.get('filename')
            if not filename:
                continue
            if filename in unique_reports:
                continue
            company_name, year, sector = _extract_info(filename)
            unique_reports[filename] = {
                'company_name': company_name,
                'year': year,
                'sector': sector,
                # Chunk metadata rarely carries a score; defaults to 0.
                'greenwashing_score': meta.get('greenwashing_score', 0),
                'filename': filename,
                'title': f"{company_name} {year} Report"
            }
        return jsonify(list(unique_reports.values()))
    except Exception as e:
        print(f"Error fetching reports: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
@reports_bp.route('/search', methods=['POST'])
def search_reports():
    """Search reports by semantic similarity plus keyword boosting.

    Expects a JSON body ``{"query": "..."}``.  Embeds the query, pulls the
    top chunks from Chroma, deduplicates by source file, strongly boosts
    results whose filename or inferred company contains the query verbatim,
    and returns up to 20 report descriptors sorted best-first.  Returns an
    empty list for an empty query and a 500 payload on failure.
    """
    import re

    # get_json(silent=True) returns None instead of raising when the body
    # is absent or not valid JSON; the original `request.json` could fail
    # outside the try block for such requests.
    data = request.get_json(silent=True) or {}
    query = data.get('query', '')
    if not query:
        return jsonify([])
    try:
        # Known company substrings -> (display name, sector); first match
        # wins, mirroring the original if/elif chain.
        known_companies = [
            ('tesla', 'Tesla', 'Automotive'),
            ('google', 'Google', 'Tech'),
            ('apple', 'Apple', 'Tech'),
            ('microsoft', 'Microsoft', 'Tech'),
            ('amazon', 'Amazon', 'Tech'),
            ('boeing', 'Boeing', 'Aerospace'),
            ('ghgp', 'GHGP Data', 'Data'),
            ('salesforce', 'Salesforce', 'Tech'),
        ]

        def extract_company_info(filename):
            # Infer (company, year, sector) from the source filename.
            company_name, year, sector = "Unknown", "N/A", "Other"
            lower_name = filename.lower()
            year_match = re.search(r'20\d{2}', lower_name)
            if year_match:
                year = year_match.group(0)
            for needle, name, sec in known_companies:
                if needle in lower_name:
                    return name, year, sec
            if 'hp ' in lower_name or 'hp-' in lower_name or lower_name.startswith('hp'):
                return "HP", year, "Tech"
            parts = re.split(r'[-_.]', filename)
            if parts:
                company_name = parts[0].capitalize()
                if company_name.isdigit():
                    company_name = parts[1].capitalize() if len(parts) > 1 else "Unknown"
            return company_name, year, sector

        query_embedding = get_embedding(query)
        # Over-fetch so deduplication by file still leaves enough results.
        results = search_documents(query_embedding, num_results=50)
        query_lower = query.lower()
        output = []
        seen_filenames = set()
        for item in results:
            meta = item.get('metadata', {})
            text = item.get('text', '')
            filename = meta.get('source') or meta.get('filename', 'Unknown')
            # One entry per source file.
            if filename in seen_filenames:
                continue
            seen_filenames.add(filename)
            company_name, year, sector = extract_company_info(filename)
            # Literal substring hits on the filename or company dominate
            # the ranking over pure semantic similarity.
            match_boost = 0
            if query_lower in filename.lower() or query_lower in company_name.lower():
                match_boost = 1000
            # Chroma distances are lower-is-better, so invert.  `is not
            # None` fixes the original truthiness test, which scored a
            # perfect distance of 0 as having no semantic relevance.
            distance = item.get('score')
            semantic_score = 1 / (distance + 0.001) if distance is not None else 0
            combined_score = match_boost + semantic_score
            snippet = text[:300] + "..." if len(text) > 300 else text
            output.append({
                'company_name': company_name,
                'year': year,
                'filename': filename,
                'sector': sector,
                'greenwashing_score': meta.get('greenwashing_score', 0),
                'snippet': snippet,
                'relevance_score': distance,
                '_combined_score': combined_score
            })
        # Best results first; strip the internal ranking field before
        # replying and cap the payload at 20 entries.
        output.sort(key=lambda entry: entry.get('_combined_score', 0), reverse=True)
        for entry in output:
            entry.pop('_combined_score', None)
        return jsonify(output[:20])
    except Exception as e:
        print(f"Error searching reports: {e}")
        return jsonify({'error': str(e)}), 500
@reports_bp.route('/view/<path:filename>', methods=['GET'])
def view_report_file(filename):
    """Serve a raw report file from the dataset directory.

    ``send_from_directory`` refuses paths that resolve outside
    ``dataset_dir`` (per Flask's documentation), so the ``<path:filename>``
    converter does not open a directory-traversal hole.
    """
    import os
    from flask import send_from_directory
    # Resolve the dataset directory relative to this source file.
    # NOTE(review): the original comment described three levels up
    # ("../../../dataset" from src/routes/reports.py) but the code below
    # only ascends two ('..', '..'), landing next to src/ rather than
    # next to backend/ — confirm against the on-disk layout.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    dataset_dir = os.path.join(current_dir, '..', '..', 'dataset')
    return send_from_directory(dataset_dir, filename)