Reports Update
@@ -4,4 +4,4 @@ from src import create_app
 app = create_app()
 
 if __name__ == "__main__":
-    app.run(debug=True, port=5000)
+    app.run(debug=True, port=5000, host="0.0.0.0")
@@ -11,5 +11,7 @@ def create_app():
     app.register_blueprint(main_bp)
     app.register_blueprint(rag_bp, url_prefix='/api/rag')
     app.register_blueprint(gemini_bp, url_prefix='/api/gemini')
+    from .routes.reports import reports_bp
+    app.register_blueprint(reports_bp, url_prefix='/api/reports')
 
     return app
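A quick smoke test for the newly mounted blueprint (a hypothetical sketch, not part of this commit; assumes the dev server from the first hunk is listening on port 5000):

# Hypothetical smoke test; the URL prefix comes from the hunk above.
import requests

resp = requests.get("http://localhost:5000/api/reports/")
print(resp.status_code)  # expect 200 once the reports route is live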
@@ -53,9 +53,11 @@ def search_documents(query_embedding, collection_name=COLLECTION_NAME, num_resul
     if results and results["documents"]:
         for i, doc in enumerate(results["documents"][0]):
             score = results["distances"][0][i] if "distances" in results else None
+            meta = results["metadatas"][0][i] if "metadatas" in results else {}
             output.append({
                 "text": doc,
-                "score": score
+                "score": score,
+                "metadata": meta
             })
 
     return output
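After this change each item returned by search_documents() carries its chunk metadata. A minimal consumer sketch (not in the commit; import paths taken from reports.py below):

# Sketch: print the source file for each hit, using the new "metadata" key.
from src.chroma.vector_store import search_documents
from src.rag.embeddings import get_embedding

for item in search_documents(get_embedding("carbon neutrality"), num_results=3):
    print(item["score"], item["metadata"].get("source"), item["text"][:60])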
@@ -67,3 +69,12 @@ def delete_documents_by_source(source_file, collection_name=COLLECTION_NAME):
         collection.delete(ids=results["ids"])
         return len(results["ids"])
     return 0
+
+def get_all_metadatas(collection_name=COLLECTION_NAME, limit=None):
+    collection = get_collection(collection_name)
+    # Only fetch metadatas to be lightweight
+    if limit:
+        results = collection.get(include=["metadatas"], limit=limit)
+    else:
+        results = collection.get(include=["metadatas"])
+    return results["metadatas"] if results and "metadatas" in results else []
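A minimal sketch of the new helper in use (hypothetical; assumes a populated collection):

# Sketch: list the source filenames of the first few stored chunks.
from src.chroma.vector_store import get_all_metadatas

for meta in get_all_metadatas(limit=5):
    print(meta.get("source"))  # e.g. "2020-tesla-impact-report.pdf"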
@@ -3,9 +3,15 @@ import os
 
 def generate_content(prompt, model_name="gemini-2.0-flash-exp"):
     api_key = os.environ.get("GOOGLE_API_KEY")
-    client = genai.Client(api_key=api_key)
-    response = client.models.generate_content(
-        model=model_name,
-        contents=prompt,
-    )
-    return response.text
+    if not api_key:
+        return "Error: GOOGLE_API_KEY not found."
+
+    try:
+        client = genai.Client(api_key=api_key)
+        response = client.models.generate_content(
+            model=model_name,
+            contents=prompt,
+        )
+        return response.text
+    except Exception as e:
+        return f"Error interacting with Gemini API: {str(e)}"
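The hardened helper now degrades to an error string instead of raising. A usage sketch (hypothetical; the module path is an assumption, it is not shown in this hunk):

# Sketch: generate_content returns an error string rather than raising.
import os

os.environ.setdefault("GOOGLE_API_KEY", "<your-key>")  # placeholder key
from src.gemini.utils import generate_content  # assumed module path

print(generate_content("Summarize this sustainability report in one sentence."))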
@@ -24,7 +24,10 @@ class GeminiClient:
 
         response = self.client.models.generate_content(
             model=self.model_name,
-            contents=full_message
+            contents=full_message,
+            config={
+                'system_instruction': 'You are a concise sustainability assistant. Your responses must be a single short paragraph, maximum 6 sentences long. Do not use bullet points or multiple sections.'
+            }
         )
         return response.text
 
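For reference, the dict config above can also be written with the typed config object from the google-genai SDK (a sketch, not part of the commit):

# Sketch: typed equivalent of the dict-style config in the hunk above.
from google import genai
from google.genai import types

client = genai.Client()  # picks up GOOGLE_API_KEY from the environment
response = client.models.generate_content(
    model="gemini-2.0-flash-exp",
    contents="How can our office cut energy use?",
    config=types.GenerateContentConfig(
        system_instruction="You are a concise sustainability assistant."
    ),
)
print(response.text)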
228  backend/src/routes/reports.py  Normal file
@@ -0,0 +1,228 @@
+from flask import Blueprint, jsonify, request
+from src.chroma.vector_store import get_all_metadatas, search_documents
+from src.rag.embeddings import get_embedding
+
+reports_bp = Blueprint('reports', __name__)
+
+@reports_bp.route('/', methods=['GET'])
+def get_reports():
+    try:
+        # Fetch all metadatas to ensure we get diversity.
+        # 60k items is manageable for metadata-only fetch.
+        metadatas = get_all_metadatas()
+
+        unique_reports = {}
+
+        for meta in metadatas:
+            filename = meta.get('source') or meta.get('filename')
+            if not filename:
+                continue
+
+            if filename not in unique_reports:
+                # Attempt to extract info from filename
+                # Common patterns:
+                #   2020-tesla-impact-report.pdf
+                #   google-2023-environmental-report.pdf
+                #   ghgp_data_2021.xlsx
+
+                company_name = "Unknown"
+                year = "N/A"
+                sector = "Other"
+
+                lower_name = filename.lower()
+
+                # Extract Year
+                import re
+                year_match = re.search(r'20\d{2}', lower_name)
+                if year_match:
+                    year = year_match.group(0)
+
+                # Extract Company (heuristics)
+                if 'tesla' in lower_name:
+                    company_name = "Tesla"
+                    sector = "Automotive"
+                elif 'google' in lower_name:
+                    company_name = "Google"
+                    sector = "Tech"
+                elif 'apple' in lower_name:
+                    company_name = "Apple"
+                    sector = "Tech"
+                elif 'microsoft' in lower_name:
+                    company_name = "Microsoft"
+                    sector = "Tech"
+                elif 'amazon' in lower_name:
+                    company_name = "Amazon"
+                    sector = "Tech"
+                elif 'boeing' in lower_name:
+                    company_name = "Boeing"
+                    sector = "Aerospace"
+                elif 'ghgp' in lower_name:
+                    company_name = "GHGP Data"
+                    sector = "Data"
+                elif 'salesforce' in lower_name:
+                    company_name = "Salesforce"
+                    sector = "Tech"
+                elif 'hp ' in lower_name or 'hp-' in lower_name:
+                    company_name = "HP"
+                    sector = "Tech"
+                else:
+                    # Fallback: capitalize first word of filename
+                    parts = re.split(r'[-_.]', filename)
+                    if parts:
+                        company_name = parts[0].capitalize()
+                        if company_name.isdigit():  # If starts with year
+                            company_name = parts[1].capitalize() if len(parts) > 1 else "Unknown"
+
+                unique_reports[filename] = {
+                    'company_name': company_name,
+                    'year': year,
+                    'sector': sector,
+                    'greenwashing_score': meta.get('greenwashing_score', 0),  # Likely 0
+                    'filename': filename,
+                    'title': f"{company_name} {year} Report"
+                }
+
+        reports_list = list(unique_reports.values())
+        return jsonify(reports_list)
+
+    except Exception as e:
+        print(f"Error fetching reports: {e}")
+        import traceback
+        traceback.print_exc()
+        return jsonify({'error': str(e)}), 500
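What a client might see from this endpoint (a sketch with hypothetical values; field names come from the dict built above):

# Sketch: consume GET /api/reports/ from a client.
import requests

reports = requests.get("http://localhost:5000/api/reports/").json()
# Each entry resembles:
# {"company_name": "Tesla", "year": "2020", "sector": "Automotive",
#  "greenwashing_score": 0, "filename": "2020-tesla-impact-report.pdf",
#  "title": "Tesla 2020 Report"}
print(len(reports), "unique reports")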
+
+@reports_bp.route('/search', methods=['POST'])
+def search_reports():
+    data = request.json
+    query = data.get('query', '')
+
+    if not query:
+        return jsonify([])
+
+    try:
+        import re
+
+        # Get embedding for the query
+        query_embedding = get_embedding(query)
+
+        # Search in Chroma - get more results to filter
+        results = search_documents(query_embedding, num_results=50)
+
+        query_lower = query.lower()
+
+        # Helper function to extract company info
+        def extract_company_info(filename):
+            company_name = "Unknown"
+            year = "N/A"
+            sector = "Other"
+
+            lower_name = filename.lower()
+
+            # Extract Year
+            year_match = re.search(r'20\d{2}', lower_name)
+            if year_match:
+                year = year_match.group(0)
+
+            # Extract Company (heuristics)
+            if 'tesla' in lower_name:
+                company_name = "Tesla"
+                sector = "Automotive"
+            elif 'google' in lower_name:
+                company_name = "Google"
+                sector = "Tech"
+            elif 'apple' in lower_name:
+                company_name = "Apple"
+                sector = "Tech"
+            elif 'microsoft' in lower_name:
+                company_name = "Microsoft"
+                sector = "Tech"
+            elif 'amazon' in lower_name:
+                company_name = "Amazon"
+                sector = "Tech"
+            elif 'boeing' in lower_name:
+                company_name = "Boeing"
+                sector = "Aerospace"
+            elif 'ghgp' in lower_name:
+                company_name = "GHGP Data"
+                sector = "Data"
+            elif 'salesforce' in lower_name:
+                company_name = "Salesforce"
+                sector = "Tech"
+            elif 'hp ' in lower_name or 'hp-' in lower_name or lower_name.startswith('hp'):
+                company_name = "HP"
+                sector = "Tech"
+            else:
+                parts = re.split(r'[-_.]', filename)
+                if parts:
+                    company_name = parts[0].capitalize()
+                    if company_name.isdigit():
+                        company_name = parts[1].capitalize() if len(parts) > 1 else "Unknown"
+
+            return company_name, year, sector
+
+        output = []
+        seen_filenames = set()
+
+        for item in results:
+            meta = item.get('metadata', {})
+            text = item.get('text', '')
+
+            filename = meta.get('source') or meta.get('filename', 'Unknown')
+
+            # Skip duplicates
+            if filename in seen_filenames:
+                continue
+            seen_filenames.add(filename)
+
+            company_name, year, sector = extract_company_info(filename)
+
+            # Calculate match score - boost if query matches company/filename
+            match_boost = 0
+            if query_lower in filename.lower():
+                match_boost = 1000  # Strong boost for filename match
+            if query_lower in company_name.lower():
+                match_boost = 1000  # Strong boost for company match
+
+            # Semantic score (inverted distance, higher = better)
+            semantic_score = 1 / (item.get('score', 1) + 0.001) if item.get('score') else 0
+
+            combined_score = match_boost + semantic_score
+
+            # Format snippet
+            snippet = text[:300] + "..." if len(text) > 300 else text
+
+            output.append({
+                'company_name': company_name,
+                'year': year,
+                'filename': filename,
+                'sector': sector,
+                'greenwashing_score': meta.get('greenwashing_score', 0),
+                'snippet': snippet,
+                'relevance_score': item.get('score'),
+                '_combined_score': combined_score
+            })
+
+        # Sort by combined score (descending - higher is better)
+        output.sort(key=lambda x: x.get('_combined_score', 0), reverse=True)
+
+        # Remove internal score field and limit results
+        for item in output:
+            item.pop('_combined_score', None)
+
+        return jsonify(output[:20])
+    except Exception as e:
+        print(f"Error searching reports: {e}")
+        return jsonify({'error': str(e)}), 500
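A matching client sketch for the search route (hypothetical; the query string is illustrative):

# Sketch: consume POST /api/reports/search from a client.
import requests

hits = requests.post(
    "http://localhost:5000/api/reports/search",
    json={"query": "tesla"},
).json()
for hit in hits[:3]:
    print(hit["company_name"], hit["year"], hit["relevance_score"])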
+
+@reports_bp.route('/view/<path:filename>', methods=['GET'])
+def view_report_file(filename):
+    import os
+    from flask import send_from_directory
+
+    # Dataset path relative to this file
+    # src/routes/reports.py -> src/routes -> src -> backend -> dataset
+    # So ../../../dataset
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    dataset_dir = os.path.join(current_dir, '..', '..', 'dataset')
+
+    return send_from_directory(dataset_dir, filename)
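And a sketch for the file-serving route (hypothetical; the filename is taken from the comment patterns earlier in the file):

# Sketch: download a served report via GET /api/reports/view/<filename>.
import requests

resp = requests.get(
    "http://localhost:5000/api/reports/view/2020-tesla-impact-report.pdf"
)
with open("report.pdf", "wb") as f:
    f.write(resp.content)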