BIG WEB UPDATE

This commit is contained in:
2025-09-27 22:45:52 -04:00
parent 6bdd8f0fe3
commit f1073ef3df
17 changed files with 1818 additions and 244 deletions

211
ai/main.py Normal file
View File

@@ -0,0 +1,211 @@
import os
from datetime import datetime

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo.errors import BulkWriteError
# Load environment variables from the local dotenv file
load_dotenv('.env.local')

# MongoDB connection
MONGO_URI = os.getenv('MONGO_URI')
if not MONGO_URI:
    # MongoClient(None) falls back to its default host (localhost), which
    # would silently import the data into the wrong server; fail fast instead.
    raise SystemExit("MONGO_URI is not set; define it in .env.local or the environment")
client = MongoClient(MONGO_URI)
db = client['crashes']
collection = db['crashes']

# Read CSV
print("Reading CSV file...")
df = pd.read_csv('Crashes_in_DC.csv')
print(f"Loaded {len(df)} crash records")
# Helper to calculate severity based on injury data.
# Severity tiers, ordered from most to least severe, with the count columns
# that feed each tier.
_SEVERITY_TIERS = (
    ("Fatal", (
        'FATAL_BICYCLIST',
        'FATAL_DRIVER',
        'FATAL_PEDESTRIAN',
        'FATALPASSENGER',
        'FATALOTHER',
    )),
    ("Major Injury", (
        'MAJORINJURIES_BICYCLIST',
        'MAJORINJURIES_DRIVER',
        'MAJORINJURIES_PEDESTRIAN',
        'MAJORINJURIESPASSENGER',
        'MAJORINJURIESOTHER',
    )),
    ("Minor Injury", (
        'MINORINJURIES_BICYCLIST',
        'MINORINJURIES_DRIVER',
        'MINORINJURIES_PEDESTRIAN',
        'MINORINJURIESPASSENGER',
        'MINORINJURIESOTHER',
    )),
)


def _sum_counts(row, columns):
    """Sum the given count columns of *row*, treating missing/NaN cells as 0."""
    total = 0
    for column in columns:
        value = row.get(column, 0)
        # A single NaN would turn the whole sum into NaN, and ``NaN > 0`` is
        # False, which would hide real casualties in the other columns.
        if pd.notna(value):
            total += value
    return total


def calculate_severity(row):
    """Classify a crash by its most severe casualty.

    Args:
        row: Mapping-like record (``dict`` or ``pandas.Series``) exposing
            ``.get(column, default)`` for the casualty count columns.

    Returns:
        ``"Fatal"``, ``"Major Injury"`` or ``"Minor Injury"`` for the first
        tier whose counts sum to more than zero, otherwise
        ``"Property Damage Only"``.
    """
    for label, columns in _SEVERITY_TIERS:
        if _sum_counts(row, columns) > 0:
            return label
    return "Property Damage Only"
# Helpers to convert a CSV row to a MongoDB document.
# Text spellings treated as "false" when a flag column is read as a string.
_FALSE_STRINGS = frozenset({"", "0", "n", "no", "f", "false"})


def _as_int(row, column):
    """Return ``row[column]`` as an int; missing, NaN or non-numeric cells give 0."""
    value = row.get(column, 0)
    if pd.isna(value):
        return 0
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return 0


def _as_flag(row, column):
    """Return ``row[column]`` as a bool; missing or NaN cells give False."""
    value = row.get(column, False)
    if pd.isna(value):
        # bool(NaN) is True, which would mark blank cells as "involved".
        return False
    if isinstance(value, str):
        return value.strip().lower() not in _FALSE_STRINGS
    return bool(value)


def _as_text(row, column):
    """Return ``row[column]`` as a string; missing or NaN cells give ''."""
    value = row.get(column, '')
    return '' if pd.isna(value) else str(value)


def _as_coordinate(value, limit):
    """Return *value* as a non-zero float within [-limit, limit], else None."""
    if pd.isna(value):
        return None
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    # The chained comparison is also False for NaN and infinities.
    if number == 0 or not -limit <= number <= limit:
        return None
    return number


def row_to_doc(row):
    """Build the MongoDB document for one crash record.

    Args:
        row: Mapping-like record (``dict`` or ``pandas.Series``) exposing
            ``.get(column, default)`` for the CSV columns.

    Returns:
        A ``dict`` with a GeoJSON ``location`` point and the crash details,
        or ``None`` when the record has no usable coordinates (missing,
        zero, non-numeric, or outside the valid longitude/latitude range).
    """
    # Skip records with invalid coordinates. Out-of-range values are rejected
    # too, because a 2dsphere index only accepts longitude in [-180, 180] and
    # latitude in [-90, 90].
    longitude = _as_coordinate(row.get('LONGITUDE'), 180.0)
    latitude = _as_coordinate(row.get('LATITUDE'), 90.0)
    if longitude is None or latitude is None:
        return None

    # Parse date; unparseable values are stored as None rather than aborting.
    report_date = None
    raw_date = row.get('REPORTDATE')
    if pd.notna(raw_date):
        try:
            report_date = pd.to_datetime(raw_date)
        except (ValueError, TypeError, OverflowError):
            report_date = None
        else:
            if pd.isna(report_date):  # e.g. NaT from an empty string
                report_date = None

    # Build the document with GeoJSON location
    return {
        "crashId": _as_text(row, 'CRIMEID'),
        "ccn": _as_text(row, 'CCN'),
        "reportDate": report_date,
        "location": {
            "type": "Point",
            "coordinates": [longitude, latitude],  # [longitude, latitude]
        },
        "address": _as_text(row, 'ADDRESS'),
        "severity": calculate_severity(row),
        "ward": _as_text(row, 'WARD'),
        "vehicles": {
            "total": _as_int(row, 'TOTAL_VEHICLES'),
            "taxis": _as_int(row, 'TOTAL_TAXIS'),
            "government": _as_int(row, 'TOTAL_GOVERNMENT'),
        },
        "casualties": {
            "bicyclists": {
                "fatal": _as_int(row, 'FATAL_BICYCLIST'),
                "major_injuries": _as_int(row, 'MAJORINJURIES_BICYCLIST'),
                "minor_injuries": _as_int(row, 'MINORINJURIES_BICYCLIST'),
                "unknown_injuries": _as_int(row, 'UNKNOWNINJURIES_BICYCLIST'),
                "total": _as_int(row, 'TOTAL_BICYCLES'),
            },
            "drivers": {
                "fatal": _as_int(row, 'FATAL_DRIVER'),
                "major_injuries": _as_int(row, 'MAJORINJURIES_DRIVER'),
                "minor_injuries": _as_int(row, 'MINORINJURIES_DRIVER'),
                "unknown_injuries": _as_int(row, 'UNKNOWNINJURIES_DRIVER'),
            },
            "pedestrians": {
                "fatal": _as_int(row, 'FATAL_PEDESTRIAN'),
                "major_injuries": _as_int(row, 'MAJORINJURIES_PEDESTRIAN'),
                "minor_injuries": _as_int(row, 'MINORINJURIES_PEDESTRIAN'),
                "unknown_injuries": _as_int(row, 'UNKNOWNINJURIES_PEDESTRIAN'),
                "total": _as_int(row, 'TOTAL_PEDESTRIANS'),
            },
            "passengers": {
                "fatal": _as_int(row, 'FATALPASSENGER'),
                "major_injuries": _as_int(row, 'MAJORINJURIESPASSENGER'),
                "minor_injuries": _as_int(row, 'MINORINJURIESPASSENGER'),
                "unknown_injuries": _as_int(row, 'UNKNOWNINJURIESPASSENGER'),
            },
        },
        "circumstances": {
            "speeding_involved": _as_flag(row, 'SPEEDING_INVOLVED'),
            "pedestrians_impaired": _as_flag(row, 'PEDESTRIANSIMPAIRED'),
            "bicyclists_impaired": _as_flag(row, 'BICYCLISTSIMPAIRED'),
            "drivers_impaired": _as_flag(row, 'DRIVERSIMPAIRED'),
        },
        "location_details": {
            "nearest_intersection": _as_text(row, 'NEARESTINTSTREETNAME'),
            "off_intersection": _as_flag(row, 'OFFINTERSECTION'),
            "approach_direction": _as_text(row, 'INTAPPROACHDIRECTION'),
        },
    }
# Convert all rows to documents
print("Converting data to MongoDB documents...")
docs = []
skipped_count = 0
for _, row in df.iterrows():
    doc = row_to_doc(row)
    if doc is None:
        skipped_count += 1
    else:
        docs.append(doc)
print(f"Converted {len(docs)} valid documents")
print(f"Skipped {skipped_count} records with invalid coordinates")

# Insert into MongoDB in batches
print("Inserting documents into MongoDB...")
batch_size = 1000
total_batches = (len(docs) + batch_size - 1) // batch_size
total_inserted = 0
failed_count = 0
for batch_number, start in enumerate(range(0, len(docs), batch_size), start=1):
    batch = docs[start:start + batch_size]
    try:
        result = collection.insert_many(batch, ordered=False)
    except BulkWriteError as e:
        # With ordered=False the server keeps going past a bad document, so
        # part of the batch was still written; count it instead of treating
        # the whole batch as lost.
        inserted = e.details.get('nInserted', 0)
        total_inserted += inserted
        failed_count += len(batch) - inserted
        print(f"Error inserting batch: {e}")
    except Exception as e:
        failed_count += len(batch)
        print(f"Error inserting batch: {e}")
    else:
        total_inserted += len(result.inserted_ids)
        print(f"Inserted batch {batch_number}/{total_batches} - Total: {total_inserted}")
print(f"Successfully inserted {total_inserted} documents")
if failed_count:
    print(f"Failed to insert {failed_count} documents")

# Create 2dsphere index for geospatial queries
print("Creating 2dsphere index for geospatial queries...")
try:
    collection.create_index([("location", "2dsphere")])
    print("Successfully created 2dsphere index on 'location' field")
except Exception as e:
    print(f"Error creating index: {e}")

# Create additional indexes for common queries
print("Creating additional indexes...")
try:
    collection.create_index([("severity", 1)])
    collection.create_index([("reportDate", 1)])
    collection.create_index([("ward", 1)])
    print("Successfully created additional indexes")
except Exception as e:
    print(f"Error creating additional indexes: {e}")
print("Data import completed!")

# Sample geospatial query to test
print("\n--- Testing geospatial query ---")
try:
    # Find crashes within 1000 meters of a point in DC
    sample_point = [-77.0369, 38.9072]  # Washington DC coordinates
    nearby_crashes = collection.find({
        "location": {
            "$nearSphere": {
                "$geometry": {
                    "type": "Point",
                    "coordinates": sample_point
                },
                "$maxDistance": 1000  # 1000 meters
            }
        }
    }).limit(5)
    print(f"Sample query: Found crashes within 1000m of {sample_point}:")
    for crash in nearby_crashes:
        print(f" - Crash ID: {crash['crashId']}, Address: {crash['address']}, Severity: {crash['severity']}")
except Exception as e:
    print(f"Error running sample query: {e}")
finally:
    # Release the connection even if the sample query fails unexpectedly.
    client.close()