BIG WEB UPDATE

This commit is contained in:
2025-09-27 22:45:52 -04:00
parent 6bdd8f0fe3
commit f1073ef3df
17 changed files with 1818 additions and 244 deletions

211
ai/main.py Normal file
View File

@@ -0,0 +1,211 @@
import os
from datetime import datetime

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo.errors import BulkWriteError
# Pull configuration (notably MONGO_URI) from the local env file.
load_dotenv('.env.local')

# Connect to MongoDB and select the target database and collection.
MONGO_URI = os.environ.get('MONGO_URI')
client = MongoClient(MONGO_URI)
db = client['crashes']
collection = db['crashes']

# Load the raw crash records from the CSV export.
print("Reading CSV file...")
df = pd.read_csv('Crashes_in_DC.csv')
print(f"Loaded {len(df)} crash records")
# Helper to calculate severity based on injury data
def calculate_severity(row):
    """Classify a crash by the worst outcome recorded for anyone involved.

    Args:
        row: Mapping-like record (pandas Series or dict) exposing
            ``.get(key, default)`` for the per-role casualty count columns.

    Returns:
        ``"Fatal"``, ``"Major Injury"``, ``"Minor Injury"`` or
        ``"Property Damage Only"``, checked in that order of precedence.
    """

    def _total(columns):
        # Empty CSV cells arrive as NaN. Adding NaN poisons the whole sum and
        # makes every `> 0` comparison False, so missing counts are taken as 0.
        total = 0
        for column in columns:
            value = row.get(column, 0)
            if not pd.isna(value):
                total += value
        return total

    fatal_count = _total((
        'FATAL_BICYCLIST',
        'FATAL_DRIVER',
        'FATAL_PEDESTRIAN',
        'FATALPASSENGER',
        'FATALOTHER',
    ))
    if fatal_count > 0:
        return "Fatal"

    major_injury_count = _total((
        'MAJORINJURIES_BICYCLIST',
        'MAJORINJURIES_DRIVER',
        'MAJORINJURIES_PEDESTRIAN',
        'MAJORINJURIESPASSENGER',
        'MAJORINJURIESOTHER',
    ))
    if major_injury_count > 0:
        return "Major Injury"

    minor_injury_count = _total((
        'MINORINJURIES_BICYCLIST',
        'MINORINJURIES_DRIVER',
        'MINORINJURIES_PEDESTRIAN',
        'MINORINJURIESPASSENGER',
        'MINORINJURIESOTHER',
    ))
    if minor_injury_count > 0:
        return "Minor Injury"

    return "Property Damage Only"
# Helper to convert row to MongoDB document
def row_to_doc(row):
    """Convert one crash CSV record into a MongoDB document.

    Args:
        row: Mapping-like record (pandas Series or dict) exposing ``.get``.

    Returns:
        A dict ready for insertion, with a GeoJSON ``Point`` under
        ``location``; or ``None`` when the coordinates are missing, zero,
        non-numeric, or outside the valid longitude/latitude range, so the
        caller can skip the record.
    """
    try:
        longitude = float(row.get('LONGITUDE'))
        latitude = float(row.get('LATITUDE'))
    except (TypeError, ValueError):
        # None, pd.NA, or unparseable text: treat as an invalid coordinate.
        return None

    if longitude == 0 or latitude == 0:
        return None
    # NaN fails both chained comparisons, so this also rejects missing values.
    # Out-of-range points must be dropped here: a single one makes the later
    # 2dsphere index build fail for the whole collection.
    if not (-180 <= longitude <= 180 and -90 <= latitude <= 90):
        return None

    # Parse the report date; anything unparseable is stored as None.
    report_date = None
    raw_date = row.get('REPORTDATE')
    if pd.notna(raw_date):
        try:
            parsed = pd.to_datetime(raw_date)
        except (ValueError, TypeError, OverflowError):
            parsed = None
        if parsed is not None and pd.notna(parsed):
            report_date = parsed

    # Build the document with GeoJSON location
    doc = {
        "crashId": _text_field(row, 'CRIMEID'),
        "ccn": _text_field(row, 'CCN'),
        "reportDate": report_date,
        "location": {
            "type": "Point",
            "coordinates": [longitude, latitude]  # [longitude, latitude]
        },
        "address": _text_field(row, 'ADDRESS'),
        "severity": calculate_severity(row),
        "ward": _text_field(row, 'WARD'),
        "vehicles": {
            "total": _int_field(row, 'TOTAL_VEHICLES'),
            "taxis": _int_field(row, 'TOTAL_TAXIS'),
            "government": _int_field(row, 'TOTAL_GOVERNMENT')
        },
        "casualties": {
            "bicyclists": {
                "fatal": _int_field(row, 'FATAL_BICYCLIST'),
                "major_injuries": _int_field(row, 'MAJORINJURIES_BICYCLIST'),
                "minor_injuries": _int_field(row, 'MINORINJURIES_BICYCLIST'),
                "unknown_injuries": _int_field(row, 'UNKNOWNINJURIES_BICYCLIST'),
                "total": _int_field(row, 'TOTAL_BICYCLES')
            },
            "drivers": {
                "fatal": _int_field(row, 'FATAL_DRIVER'),
                "major_injuries": _int_field(row, 'MAJORINJURIES_DRIVER'),
                "minor_injuries": _int_field(row, 'MINORINJURIES_DRIVER'),
                "unknown_injuries": _int_field(row, 'UNKNOWNINJURIES_DRIVER')
            },
            "pedestrians": {
                "fatal": _int_field(row, 'FATAL_PEDESTRIAN'),
                "major_injuries": _int_field(row, 'MAJORINJURIES_PEDESTRIAN'),
                "minor_injuries": _int_field(row, 'MINORINJURIES_PEDESTRIAN'),
                "unknown_injuries": _int_field(row, 'UNKNOWNINJURIES_PEDESTRIAN'),
                "total": _int_field(row, 'TOTAL_PEDESTRIANS')
            },
            "passengers": {
                "fatal": _int_field(row, 'FATALPASSENGER'),
                "major_injuries": _int_field(row, 'MAJORINJURIESPASSENGER'),
                "minor_injuries": _int_field(row, 'MINORINJURIESPASSENGER'),
                "unknown_injuries": _int_field(row, 'UNKNOWNINJURIESPASSENGER')
            }
        },
        "circumstances": {
            "speeding_involved": _flag_field(row, 'SPEEDING_INVOLVED'),
            "pedestrians_impaired": _flag_field(row, 'PEDESTRIANSIMPAIRED'),
            "bicyclists_impaired": _flag_field(row, 'BICYCLISTSIMPAIRED'),
            "drivers_impaired": _flag_field(row, 'DRIVERSIMPAIRED')
        },
        "location_details": {
            "nearest_intersection": _text_field(row, 'NEARESTINTSTREETNAME'),
            "off_intersection": _flag_field(row, 'OFFINTERSECTION'),
            "approach_direction": _text_field(row, 'INTAPPROACHDIRECTION')
        }
    }
    return doc


def _int_field(row, column):
    """Return the column as an int, with missing/NaN cells counted as 0."""
    value = row.get(column, 0)
    # int(float('nan')) raises ValueError, so blanks are mapped explicitly.
    return 0 if pd.isna(value) else int(value)


def _flag_field(row, column):
    """Return the column as a bool, with missing/NaN cells counted as False."""
    value = row.get(column, False)
    # bool(float('nan')) is True, which would mark every blank cell as set.
    # NOTE(review): any non-empty string (e.g. "N") is still truthy here;
    # confirm these columns are numeric in the CSV.
    return False if pd.isna(value) else bool(value)


def _text_field(row, column):
    """Return the column as a str, with missing/NaN cells mapped to ''."""
    value = row.get(column, '')
    # str(float('nan')) would store the literal text 'nan'.
    return '' if pd.isna(value) else str(value)
# Turn every CSV row into a document, dropping rows row_to_doc rejects
print("Converting data to MongoDB documents...")
docs = [
    doc
    for doc in (row_to_doc(row) for _, row in df.iterrows())
    if doc is not None
]
# Every row yields either a document or None, so the remainder was skipped.
skipped_count = len(df) - len(docs)
print(f"Converted {len(docs)} valid documents")
print(f"Skipped {skipped_count} records with invalid coordinates")
# Insert into MongoDB in batches
print("Inserting documents into MongoDB...")
batch_size = 1000
total_inserted = 0
total_batches = (len(docs) + batch_size - 1) // batch_size
for batch_number, start in enumerate(range(0, len(docs), batch_size), start=1):
    batch = docs[start:start + batch_size]
    try:
        result = collection.insert_many(batch, ordered=False)
    except BulkWriteError as e:
        # With ordered=False the server keeps going past failing documents,
        # so part of the batch is normally stored even though the call
        # raises. Count those writes so the final total stays accurate.
        inserted = e.details.get('nInserted', 0)
        total_inserted += inserted
        print(
            f"Error inserting batch {batch_number}/{total_batches}: "
            f"{len(batch) - inserted} of {len(batch)} documents failed "
            f"- Total: {total_inserted}"
        )
    except Exception as e:
        # Best-effort import: report the failure and move on to the next batch.
        print(f"Error inserting batch: {e}")
    else:
        total_inserted += len(result.inserted_ids)
        print(f"Inserted batch {batch_number}/{total_batches} - Total: {total_inserted}")
print(f"Successfully inserted {total_inserted} documents")
# Geospatial index: needed by the $nearSphere / $geoNear queries
print("Creating 2dsphere index for geospatial queries...")
try:
    collection.create_index([("location", "2dsphere")])
    print("Successfully created 2dsphere index on 'location' field")
except Exception as e:
    print(f"Error creating index: {e}")

# Ascending single-field indexes for the common filter fields.
# The first failure aborts the remaining ones and is reported once.
print("Creating additional indexes...")
try:
    for index_spec in (
        [("severity", 1)],
        [("reportDate", 1)],
        [("ward", 1)],
    ):
        collection.create_index(index_spec)
    print("Successfully created additional indexes")
except Exception as e:
    print(f"Error creating additional indexes: {e}")

print("Data import completed!")
# Smoke-test the import with one proximity lookup, then disconnect
print("\n--- Testing geospatial query ---")
try:
    # Washington DC as [longitude, latitude]
    sample_point = [-77.0369, 38.9072]
    search_area = {
        "$geometry": {
            "type": "Point",
            "coordinates": sample_point
        },
        "$maxDistance": 1000  # 1000 meters
    }
    # At most five crashes inside the search radius
    nearby_crashes = collection.find(
        {"location": {"$nearSphere": search_area}}
    ).limit(5)
    print(f"Sample query: Found crashes within 1000m of {sample_point}:")
    for crash in nearby_crashes:
        print(f" - Crash ID: {crash['crashId']}, Address: {crash['address']}, Severity: {crash['severity']}")
except Exception as e:
    print(f"Error running sample query: {e}")

client.close()

116
ai/test_queries.py Normal file
View File

@@ -0,0 +1,116 @@
import os
from pymongo import MongoClient
from dotenv import load_dotenv
# Pull configuration (notably MONGO_URI) from the local env file
load_dotenv('.env.local')

# Connect to MongoDB and select the crash collection
MONGO_URI = os.environ.get('MONGO_URI')
client = MongoClient(MONGO_URI)
db = client['crashes']
collection = db['crashes']

print("=== MongoDB Geospatial Query Examples ===\n")

# 1. Report how many crash documents the collection holds
print("1. Total crash records in database:")
total_count = collection.count_documents(filter={})
print(f" {total_count} crash records\n")
# 2. Proximity search: up to five crashes within 500 m of the White House
print("2. Crashes within 500 meters of the White House:")
white_house = [-77.0365, 38.8977]
white_house_area = {
    "$geometry": {
        "type": "Point",
        "coordinates": white_house
    },
    "$maxDistance": 500  # 500 meters
}
white_house_cursor = collection.find(
    {"location": {"$nearSphere": white_house_area}}
).limit(5)
nearby_crashes = list(white_house_cursor)
for crash in nearby_crashes:
    print(f" - {crash['crashId']}: {crash['address']} (Severity: {crash['severity']})")
print()
# 3. Rectangle search: up to five crashes inside a downtown DC bounding box
print("3. Crashes within downtown DC bounding box:")
southwest_corner = [-77.05, 38.88]
northeast_corner = [-77.01, 38.92]
downtown_filter = {
    "location": {
        "$geoWithin": {"$box": [southwest_corner, northeast_corner]}
    }
}
downtown_crashes = list(collection.find(downtown_filter).limit(5))
for crash in downtown_crashes:
    print(f" - {crash['crashId']}: {crash['address']} (Ward: {crash['ward']})")
print()
# 4. $geoNear aggregation: up to three fatal crashes within 1 km of
#    Capitol Hill, each annotated with its distance from that point
print("4. Fatal crashes near Capitol Hill (within 1km):")
capitol_hill = [-77.0090, 38.8899]
geo_near_stage = {
    "$geoNear": {
        "near": {
            "type": "Point",
            "coordinates": capitol_hill
        },
        "distanceField": "distance",
        "maxDistance": 1000,
        "query": {"severity": "Fatal"},
        "spherical": True
    }
}
limit_stage = {"$limit": 3}
fatal_nearby = list(collection.aggregate([geo_near_stage, limit_stage]))
for crash in fatal_nearby:
    # Whole number for display
    distance_m = round(crash['distance'])
    print(f" - {crash['crashId']}: {crash['address']} ({distance_m}m away)")
print()
# 5. Group Ward 1 crashes by severity, most frequent first
print("5. Crash severity breakdown in Ward 1:")
ward_pipeline = [
    {"$match": {"ward": "Ward 1"}},
    {"$group": {"_id": "$severity", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
]
severity_breakdown = list(collection.aggregate(ward_pipeline))
for item in severity_breakdown:
    print(f" - {item['_id']}: {item['count']} crashes")
print()
# 6. Proximity search combined with an attribute filter: up to three
#    speeding-involved crashes within 800 m of DuPont Circle
print("6. Speeding-involved crashes near DuPont Circle:")
dupont_circle = [-77.0436, 38.9094]
dupont_filter = {
    "location": {
        "$nearSphere": {
            "$geometry": {
                "type": "Point",
                "coordinates": dupont_circle
            },
            "$maxDistance": 800
        }
    },
    "circumstances.speeding_involved": True
}
speeding_crashes = list(collection.find(dupont_filter).limit(3))
for crash in speeding_crashes:
    print(f" - {crash['crashId']}: {crash['address']}")
    print(f" Vehicles: {crash['vehicles']['total']}, Severity: {crash['severity']}")
print()

print("=== Geospatial queries completed successfully! ===")
client.close()