BIG WEB UPDATE
This commit is contained in:
211
ai/main.py
Normal file
211
ai/main.py
Normal file
@@ -0,0 +1,211 @@
|
||||
import os
from datetime import datetime

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo.errors import BulkWriteError
|
||||
|
||||
# Pull settings from the local env file into the process environment.
load_dotenv('.env.local')

# Connect to MongoDB; the database and the collection are both named 'crashes'.
# The lookup yields None when MONGO_URI is not set.
# NOTE(review): confirm how MongoClient should behave with a None URI.
MONGO_URI = os.environ.get('MONGO_URI')
client = MongoClient(MONGO_URI)
db = client['crashes']
collection = db['crashes']

# Load the whole crash CSV into a DataFrame before converting any rows.
print("Reading CSV file...")
df = pd.read_csv('Crashes_in_DC.csv')
print(f"Loaded {len(df)} crash records")
|
||||
|
||||
# Helper to calculate severity based on injury data
|
||||
def _sum_counts(row, columns):
    """Add up the given count columns of *row*, treating missing/NaN cells as 0."""
    total = 0
    for column in columns:
        value = row.get(column, 0)
        # An empty CSV cell comes back as NaN (or None); adding it would turn
        # the whole sum into NaN, and NaN > 0 is always False.
        if pd.notna(value):
            total += value
    return total


def calculate_severity(row):
    """Classify a crash record by its most serious casualty category.

    Args:
        row: A mapping-like record (anything with ``.get(key, default)``,
            e.g. a pandas Series or a dict) holding the per-role casualty
            count columns.

    Returns:
        "Fatal" if any fatality column is positive, otherwise
        "Major Injury", otherwise "Minor Injury", otherwise
        "Property Damage Only". Missing or NaN cells count as zero.
    """
    # Ordered from most to least severe: the first category with a positive
    # total decides the label.
    severity_columns = (
        ("Fatal", (
            'FATAL_BICYCLIST',
            'FATAL_DRIVER',
            'FATAL_PEDESTRIAN',
            'FATALPASSENGER',
            'FATALOTHER',
        )),
        ("Major Injury", (
            'MAJORINJURIES_BICYCLIST',
            'MAJORINJURIES_DRIVER',
            'MAJORINJURIES_PEDESTRIAN',
            'MAJORINJURIESPASSENGER',
            'MAJORINJURIESOTHER',
        )),
        ("Minor Injury", (
            'MINORINJURIES_BICYCLIST',
            'MINORINJURIES_DRIVER',
            'MINORINJURIES_PEDESTRIAN',
            'MINORINJURIESPASSENGER',
            'MINORINJURIESOTHER',
        )),
    )

    for label, columns in severity_columns:
        if _sum_counts(row, columns) > 0:
            return label
    return "Property Damage Only"
|
||||
|
||||
# Helper to convert row to MongoDB document
|
||||
def _cell(row, key, default):
    """Return row's value for *key*, or *default* if the key is absent or the cell is NaN/None."""
    value = row.get(key, default)
    return default if pd.isna(value) else value


def _int_cell(row, key):
    """Read a count column as a plain int, with missing cells counted as 0."""
    return int(_cell(row, key, 0))


def _bool_cell(row, key):
    """Read a flag column as a plain bool, with missing cells treated as False."""
    return bool(_cell(row, key, False))


def _str_cell(row, key):
    """Read a text column as a str, with missing cells stored as ''."""
    return str(_cell(row, key, ''))


def row_to_doc(row):
    """Convert one crash record into a MongoDB document with a GeoJSON point.

    Args:
        row: A mapping-like record (anything with ``.get`` and ``[]`` access,
            e.g. a pandas Series or a dict) keyed by the CSV column names.

    Returns:
        The document dict, or None when either coordinate is missing, NaN,
        or exactly 0 (such records are skipped by the caller).

    Raises:
        ValueError, TypeError: If a present, non-NaN cell cannot be converted
            by ``float()`` / ``int()`` (e.g. non-numeric text in a count
            column). Missing or NaN cells no longer raise; they fall back to
            0, False or '' respectively.
    """
    # Handle missing coordinates
    longitude = row.get('LONGITUDE')
    latitude = row.get('LATITUDE')

    # Skip records with invalid coordinates
    if pd.isna(longitude) or pd.isna(latitude) or longitude == 0 or latitude == 0:
        return None

    # Parse date; anything unparseable is stored as None rather than
    # aborting the whole record.
    report_date = None
    raw_date = row.get('REPORTDATE')
    if pd.notna(raw_date):
        try:
            parsed = pd.to_datetime(raw_date)
        except (ValueError, TypeError, OverflowError):
            parsed = None
        # NaT is not a usable date value, so keep None in that case too.
        if parsed is not None and pd.notna(parsed):
            report_date = parsed

    # Build the document with GeoJSON location.
    # NOTE(review): the boolean fields below are plain truthiness of the cell
    # value — confirm the source columns are 0/1 style flags.
    doc = {
        "crashId": _str_cell(row, 'CRIMEID'),
        "ccn": _str_cell(row, 'CCN'),
        "reportDate": report_date,
        "location": {
            "type": "Point",
            "coordinates": [float(longitude), float(latitude)],  # [longitude, latitude]
        },
        "address": _str_cell(row, 'ADDRESS'),
        "severity": calculate_severity(row),
        "ward": _str_cell(row, 'WARD'),
        "vehicles": {
            "total": _int_cell(row, 'TOTAL_VEHICLES'),
            "taxis": _int_cell(row, 'TOTAL_TAXIS'),
            "government": _int_cell(row, 'TOTAL_GOVERNMENT'),
        },
        "casualties": {
            "bicyclists": {
                "fatal": _int_cell(row, 'FATAL_BICYCLIST'),
                "major_injuries": _int_cell(row, 'MAJORINJURIES_BICYCLIST'),
                "minor_injuries": _int_cell(row, 'MINORINJURIES_BICYCLIST'),
                "unknown_injuries": _int_cell(row, 'UNKNOWNINJURIES_BICYCLIST'),
                "total": _int_cell(row, 'TOTAL_BICYCLES'),
            },
            "drivers": {
                "fatal": _int_cell(row, 'FATAL_DRIVER'),
                "major_injuries": _int_cell(row, 'MAJORINJURIES_DRIVER'),
                "minor_injuries": _int_cell(row, 'MINORINJURIES_DRIVER'),
                "unknown_injuries": _int_cell(row, 'UNKNOWNINJURIES_DRIVER'),
            },
            "pedestrians": {
                "fatal": _int_cell(row, 'FATAL_PEDESTRIAN'),
                "major_injuries": _int_cell(row, 'MAJORINJURIES_PEDESTRIAN'),
                "minor_injuries": _int_cell(row, 'MINORINJURIES_PEDESTRIAN'),
                "unknown_injuries": _int_cell(row, 'UNKNOWNINJURIES_PEDESTRIAN'),
                "total": _int_cell(row, 'TOTAL_PEDESTRIANS'),
            },
            "passengers": {
                "fatal": _int_cell(row, 'FATALPASSENGER'),
                "major_injuries": _int_cell(row, 'MAJORINJURIESPASSENGER'),
                "minor_injuries": _int_cell(row, 'MINORINJURIESPASSENGER'),
                "unknown_injuries": _int_cell(row, 'UNKNOWNINJURIESPASSENGER'),
            },
        },
        "circumstances": {
            "speeding_involved": _bool_cell(row, 'SPEEDING_INVOLVED'),
            "pedestrians_impaired": _bool_cell(row, 'PEDESTRIANSIMPAIRED'),
            "bicyclists_impaired": _bool_cell(row, 'BICYCLISTSIMPAIRED'),
            "drivers_impaired": _bool_cell(row, 'DRIVERSIMPAIRED'),
        },
        "location_details": {
            "nearest_intersection": _str_cell(row, 'NEARESTINTSTREETNAME'),
            "off_intersection": _bool_cell(row, 'OFFINTERSECTION'),
            "approach_direction": _str_cell(row, 'INTAPPROACHDIRECTION'),
        },
    }

    return doc
|
||||
|
||||
# Turn every DataFrame row into a document, dropping the rows that
# row_to_doc rejects (it returns None for them).
print("Converting data to MongoDB documents...")
converted = (row_to_doc(record) for _, record in df.iterrows())
docs = [document for document in converted if document is not None]

# Every row yields either a document or a skip, so the skip count is the difference.
skipped_count = len(df) - len(docs)

print(f"Converted {len(docs)} valid documents")
print(f"Skipped {skipped_count} records with invalid coordinates")
|
||||
|
||||
# Insert into MongoDB in batches
print("Inserting documents into MongoDB...")
batch_size = 1000
total_inserted = 0
total_batches = (len(docs) + batch_size - 1) // batch_size

for batch_number, start in enumerate(range(0, len(docs), batch_size), start=1):
    batch = docs[start:start + batch_size]
    try:
        result = collection.insert_many(batch, ordered=False)
    except BulkWriteError as e:
        # With ordered=False the remaining documents of the batch are still
        # attempted after a failure, so part of the batch may have been
        # written; count those so the final total is accurate.
        inserted = e.details.get('nInserted', 0)
        total_inserted += inserted
        print(
            f"Error inserting batch {batch_number}/{total_batches}: "
            f"{inserted} of {len(batch)} documents inserted - {e}"
        )
    except Exception as e:
        # Best-effort import: report the failure and carry on with the next batch.
        print(f"Error inserting batch: {e}")
    else:
        total_inserted += len(result.inserted_ids)
        print(f"Inserted batch {batch_number}/{total_batches} - Total: {total_inserted}")

print(f"Successfully inserted {total_inserted} documents")
|
||||
|
||||
# Geospatial index on the GeoJSON 'location' field, needed by the
# proximity query run at the end of this script.
print("Creating 2dsphere index for geospatial queries...")
try:
    collection.create_index([("location", "2dsphere")])
except Exception as e:
    print(f"Error creating index: {e}")
else:
    print("Successfully created 2dsphere index on 'location' field")

# Ascending single-field indexes for common queries. A failure on one
# field stops the remaining ones, and only the error is reported.
print("Creating additional indexes...")
try:
    for indexed_field in ("severity", "reportDate", "ward"):
        collection.create_index([(indexed_field, 1)])
except Exception as e:
    print(f"Error creating additional indexes: {e}")
else:
    print("Successfully created additional indexes")

print("Data import completed!")
|
||||
|
||||
# Smoke-test the import with one proximity query, then close the connection.
print("\n--- Testing geospatial query ---")
try:
    # Washington DC coordinates, in [longitude, latitude] order
    sample_point = [-77.0369, 38.9072]
    search_origin = {"type": "Point", "coordinates": sample_point}

    # Find crashes within 1000 meters of the sample point
    proximity_filter = {
        "location": {
            "$nearSphere": {
                "$geometry": search_origin,
                "$maxDistance": 1000,  # 1000 meters
            }
        }
    }
    nearby_crashes = collection.find(proximity_filter).limit(5)

    print(f"Sample query: Found crashes within 1000m of {sample_point}:")
    for crash in nearby_crashes:
        print(f" - Crash ID: {crash['crashId']}, Address: {crash['address']}, Severity: {crash['severity']}")
except Exception as e:
    print(f"Error running sample query: {e}")

client.close()
|
||||
Reference in New Issue
Block a user