import pandas as pd from pymongo import MongoClient from datetime import datetime import os from dotenv import load_dotenv import numpy as np # Load environment variables load_dotenv('.env.local') # MongoDB connection MONGO_URI = os.getenv('MONGO_URI') client = MongoClient(MONGO_URI) db = client['crashes'] collection = db['crashes'] # Read CSV print("Reading CSV file...") df = pd.read_csv('Crashes_in_DC.csv') print(f"Loaded {len(df)} crash records") # Helper to calculate severity based on injury data def calculate_severity(row): # Count total injuries and fatalities fatal_count = ( row.get('FATAL_BICYCLIST', 0) + row.get('FATAL_DRIVER', 0) + row.get('FATAL_PEDESTRIAN', 0) + row.get('FATALPASSENGER', 0) + row.get('FATALOTHER', 0) ) major_injury_count = ( row.get('MAJORINJURIES_BICYCLIST', 0) + row.get('MAJORINJURIES_DRIVER', 0) + row.get('MAJORINJURIES_PEDESTRIAN', 0) + row.get('MAJORINJURIESPASSENGER', 0) + row.get('MAJORINJURIESOTHER', 0) ) minor_injury_count = ( row.get('MINORINJURIES_BICYCLIST', 0) + row.get('MINORINJURIES_DRIVER', 0) + row.get('MINORINJURIES_PEDESTRIAN', 0) + row.get('MINORINJURIESPASSENGER', 0) + row.get('MINORINJURIESOTHER', 0) ) if fatal_count > 0: return "Fatal" elif major_injury_count > 0: return "Major Injury" elif minor_injury_count > 0: return "Minor Injury" else: return "Property Damage Only" # Helper to convert row to MongoDB document def row_to_doc(row): # Handle missing coordinates longitude = row.get('LONGITUDE') latitude = row.get('LATITUDE') # Skip records with invalid coordinates if pd.isna(longitude) or pd.isna(latitude) or longitude == 0 or latitude == 0: return None # Parse date report_date = None if pd.notna(row.get('REPORTDATE')): try: report_date = pd.to_datetime(row['REPORTDATE']) except: report_date = None # Build the document with GeoJSON location doc = { "crashId": str(row.get('CRIMEID', '')), "ccn": str(row.get('CCN', '')), "reportDate": report_date, "location": { "type": "Point", "coordinates": [float(longitude), float(latitude)] # [longitude, latitude] }, "address": str(row.get('ADDRESS', '')), "severity": calculate_severity(row), "ward": str(row.get('WARD', '')), "vehicles": { "total": int(row.get('TOTAL_VEHICLES', 0)), "taxis": int(row.get('TOTAL_TAXIS', 0)), "government": int(row.get('TOTAL_GOVERNMENT', 0)) }, "casualties": { "bicyclists": { "fatal": int(row.get('FATAL_BICYCLIST', 0)), "major_injuries": int(row.get('MAJORINJURIES_BICYCLIST', 0)), "minor_injuries": int(row.get('MINORINJURIES_BICYCLIST', 0)), "unknown_injuries": int(row.get('UNKNOWNINJURIES_BICYCLIST', 0)), "total": int(row.get('TOTAL_BICYCLES', 0)) }, "drivers": { "fatal": int(row.get('FATAL_DRIVER', 0)), "major_injuries": int(row.get('MAJORINJURIES_DRIVER', 0)), "minor_injuries": int(row.get('MINORINJURIES_DRIVER', 0)), "unknown_injuries": int(row.get('UNKNOWNINJURIES_DRIVER', 0)) }, "pedestrians": { "fatal": int(row.get('FATAL_PEDESTRIAN', 0)), "major_injuries": int(row.get('MAJORINJURIES_PEDESTRIAN', 0)), "minor_injuries": int(row.get('MINORINJURIES_PEDESTRIAN', 0)), "unknown_injuries": int(row.get('UNKNOWNINJURIES_PEDESTRIAN', 0)), "total": int(row.get('TOTAL_PEDESTRIANS', 0)) }, "passengers": { "fatal": int(row.get('FATALPASSENGER', 0)), "major_injuries": int(row.get('MAJORINJURIESPASSENGER', 0)), "minor_injuries": int(row.get('MINORINJURIESPASSENGER', 0)), "unknown_injuries": int(row.get('UNKNOWNINJURIESPASSENGER', 0)) } }, "circumstances": { "speeding_involved": bool(row.get('SPEEDING_INVOLVED', False)), "pedestrians_impaired": bool(row.get('PEDESTRIANSIMPAIRED', False)), "bicyclists_impaired": bool(row.get('BICYCLISTSIMPAIRED', False)), "drivers_impaired": bool(row.get('DRIVERSIMPAIRED', False)) }, "location_details": { "nearest_intersection": str(row.get('NEARESTINTSTREETNAME', '')), "off_intersection": bool(row.get('OFFINTERSECTION', False)), "approach_direction": str(row.get('INTAPPROACHDIRECTION', '')) } } return doc # Convert all rows to documents print("Converting data to MongoDB documents...") docs = [] skipped_count = 0 for _, row in df.iterrows(): doc = row_to_doc(row) if doc is not None: docs.append(doc) else: skipped_count += 1 print(f"Converted {len(docs)} valid documents") print(f"Skipped {skipped_count} records with invalid coordinates") # Insert into MongoDB in batches print("Inserting documents into MongoDB...") batch_size = 1000 total_inserted = 0 for i in range(0, len(docs), batch_size): batch = docs[i:i+batch_size] try: result = collection.insert_many(batch, ordered=False) total_inserted += len(result.inserted_ids) print(f"Inserted batch {i//batch_size + 1}/{(len(docs) + batch_size - 1)//batch_size} - Total: {total_inserted}") except Exception as e: print(f"Error inserting batch: {e}") print(f"Successfully inserted {total_inserted} documents") # Create 2dsphere index for geospatial queries print("Creating 2dsphere index for geospatial queries...") try: collection.create_index([("location", "2dsphere")]) print("Successfully created 2dsphere index on 'location' field") except Exception as e: print(f"Error creating index: {e}") # Create additional indexes for common queries print("Creating additional indexes...") try: collection.create_index([("severity", 1)]) collection.create_index([("reportDate", 1)]) collection.create_index([("ward", 1)]) print("Successfully created additional indexes") except Exception as e: print(f"Error creating additional indexes: {e}") print("Data import completed!") # Sample geospatial query to test print("\n--- Testing geospatial query ---") try: # Find crashes within 1000 meters of a point in DC sample_point = [-77.0369, 38.9072] # Washington DC coordinates nearby_crashes = collection.find({ "location": { "$nearSphere": { "$geometry": { "type": "Point", "coordinates": sample_point }, "$maxDistance": 1000 # 1000 meters } } }).limit(5) print(f"Sample query: Found crashes within 1000m of {sample_point}:") for crash in nearby_crashes: print(f" - Crash ID: {crash['crashId']}, Address: {crash['address']}, Severity: {crash['severity']}") except Exception as e: print(f"Error running sample query: {e}") client.close()