VTHacks13/roadcast/data.py
samarthjain2023 0df2b0019b added the model
2025-09-27 12:14:26 -04:00


import os
import hashlib
from datetime import datetime
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms
class ImageFolderDataset(Dataset):
"""A minimal image folder dataset expecting a structure: root/class_name/*.jpg"""
def __init__(self, root, transform=None):
self.root = root
self.samples = [] # list of (path, label)
classes = sorted([d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))])
self.class_to_idx = {c: i for i, c in enumerate(classes)}
for c in classes:
d = os.path.join(root, c)
for fname in os.listdir(d):
if fname.lower().endswith(('.png', '.jpg', '.jpeg')):
self.samples.append((os.path.join(d, fname), self.class_to_idx[c]))
self.transform = transform or transforms.Compose([
transforms.Resize((224, 224)),
transforms.ToTensor(),
])
def __len__(self):
return len(self.samples)
def __getitem__(self, idx):
path, label = self.samples[idx]
img = Image.open(path).convert('RGB')
img = self.transform(img)
return img, label
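# Example usage (illustrative sketch; assumes an image folder such as 'images/' laid out
# as images/<class_name>/*.jpg -- adjust the path to your data):
# from torch.utils.data import DataLoader
# dataset = ImageFolderDataset('images')
# loader = DataLoader(dataset, batch_size=32, shuffle=True)
# images, labels = next(iter(loader))
# print(images.shape, labels[:8])  # e.g. torch.Size([32, 3, 224, 224])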
class CSVDataset(Dataset):
"""Load classification tabular data from a single CSV file.
Expects a `label` column and numeric feature columns. Non-numeric columns are dropped.
"""
def __init__(self, csv_path, feature_columns=None, label_column='label', transform=None, generate_labels=False, n_buckets=100, label_method='md5', label_store=None, feature_engineer=False, lat_lon_bins=20, nrows=None):
# read CSV with low_memory=False to avoid mixed-type warnings
if nrows is None:
self.df = pd.read_csv(csv_path, low_memory=False)
else:
self.df = pd.read_csv(csv_path, nrows=nrows, low_memory=False)
self.label_column = label_column
if generate_labels:
# generate deterministic labels based on selected columns
self.df[self.label_column] = generate_labels_for_df(self.df, n_buckets=n_buckets, method=label_method, label_store=label_store)
# optional simple feature engineering: extract date parts and lat/lon bins
if feature_engineer:
try:
_add_date_features(self.df)
except Exception:
pass
try:
_add_latlon_bins(self.df, bins=lat_lon_bins)
except Exception:
pass
if label_column not in self.df.columns:
raise ValueError(f"label column '{label_column}' not found in CSV; set generate_labels=True to create labels")
# determine feature columns if not provided (numeric columns except label)
if feature_columns is None:
feature_columns = [c for c in self.df.columns if c != label_column and pd.api.types.is_numeric_dtype(self.df[c])]
self.feature_columns = feature_columns
# coerce feature columns to numeric, fill NaNs with column mean (or 0), then standardize
features_df = self.df[self.feature_columns].apply(lambda c: pd.to_numeric(c, errors='coerce'))
# fill NaNs with column mean where possible, otherwise 0
initial_means = features_df.mean()
features_df = features_df.fillna(initial_means).fillna(0.0)
        # defensive check: drop any columns that are still all-NaN (should not occur after fillna(0.0))
all_nan_cols = features_df.columns[features_df.isna().all()].tolist()
if len(all_nan_cols) > 0:
# remove from feature list so indices stay consistent
features_df = features_df.drop(columns=all_nan_cols)
self.feature_columns = [c for c in self.feature_columns if c not in all_nan_cols]
# recompute means/stds from the filled data so subtraction/division won't produce NaNs
col_means = features_df.mean()
col_stds = features_df.std().replace(0, 1.0).fillna(1.0)
# standardize using the recomputed stats
features_df = (features_df - col_means) / (col_stds + 1e-6)
self.feature_means = col_means.to_numpy(dtype=float)
self.feature_stds = col_stds.to_numpy(dtype=float)
self.features = torch.tensor(features_df.values, dtype=torch.float32)
self.labels = torch.tensor(pd.to_numeric(self.df[self.label_column], errors='coerce').fillna(0).astype(int).values, dtype=torch.long)
def __len__(self):
return len(self.df)
def __getitem__(self, idx):
return self.features[idx], int(self.labels[idx])
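# Example usage (illustrative sketch; 'data.csv' is a placeholder path -- any CSV with
# numeric feature columns works, and generate_labels=True creates labels when the CSV
# has no 'label' column):
# from torch.utils.data import DataLoader
# ds = CSVDataset('data.csv', generate_labels=True, n_buckets=100, feature_engineer=True, nrows=10000)
# loader = DataLoader(ds, batch_size=256, shuffle=True)
# xb, yb = next(iter(loader))
# print(xb.shape, yb.shape)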
def _normalize_str(x):
if pd.isna(x):
return ''
return str(x).strip().lower()
def _normalize_date(x):
try:
# try parse common formats
dt = pd.to_datetime(x)
return dt.strftime('%Y-%m-%d')
except Exception:
return ''
def generate_kmeans_labels(df, n_buckets=100, random_state=42, label_store=None):
"""Generate labels by running k-means over numeric features (deterministic with seed).
This produces clusters that are predictable from numeric inputs and are therefore
better suited for training a numeric-feature MLP than arbitrary hash buckets.
"""
    # small pure-NumPy k-means to avoid an external dependency (numpy is already imported at module level as np)
# select numeric columns only
num_df = df.select_dtypes(include=['number']).fillna(0.0)
if num_df.shape[0] == 0 or num_df.shape[1] == 0:
# fallback to hashing if no numeric columns
return generate_labels_for_df(df, n_buckets=n_buckets)
data = num_df.values.astype(float)
n_samples = data.shape[0]
rng = np.random.default_rng(random_state)
    # If a label_store file exists and contains previously fitted centers, load and reuse them
if label_store and os.path.exists(label_store):
try:
npz = np.load(label_store)
centers = npz['centers']
all_dists = np.linalg.norm(data[:, None, :] - centers[None, :, :], axis=2)
all_labels = np.argmin(all_dists, axis=1)
return pd.Series(all_labels, index=df.index)
except Exception:
# fall through to fitting
pass
# sample points to fit centers if dataset is large
sample_size = min(20000, n_samples)
if sample_size < n_samples:
idx = rng.choice(n_samples, size=sample_size, replace=False)
sample_data = data[idx]
else:
sample_data = data
# initialize centers by random sampling from sample_data
centers_idx = rng.choice(sample_data.shape[0], size=min(n_buckets, sample_data.shape[0]), replace=False)
centers = sample_data[centers_idx].astype(float)
# run a small number of iterations
max_iters = 10
for _ in range(max_iters):
# assign
dists = np.linalg.norm(sample_data[:, None, :] - centers[None, :, :], axis=2)
labels = np.argmin(dists, axis=1)
# recompute centers
new_centers = np.zeros_like(centers)
counts = np.zeros((centers.shape[0],), dtype=int)
for i, lab in enumerate(labels):
new_centers[lab] += sample_data[i]
counts[lab] += 1
for k in range(centers.shape[0]):
if counts[k] > 0:
new_centers[k] = new_centers[k] / counts[k]
else:
# reinitialize empty cluster
new_centers[k] = sample_data[rng.integers(0, sample_data.shape[0])]
# check convergence (centers change small)
shift = np.linalg.norm(new_centers - centers, axis=1).max()
centers = new_centers
if shift < 1e-4:
break
# assign labels for all data
all_dists = np.linalg.norm(data[:, None, :] - centers[None, :, :], axis=2)
all_labels = np.argmin(all_dists, axis=1)
# persist centers if requested
if label_store:
try:
np.savez_compressed(label_store, centers=centers)
except Exception:
pass
return pd.Series(all_labels, index=df.index)
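# Example usage (illustrative sketch; 'centers.npz' is a hypothetical path used to persist
# the fitted centers so labels stay consistent across runs, and 'data.csv' is a placeholder
# for the real dataset):
# df = pd.read_csv('data.csv', nrows=5000, low_memory=False)
# labels = generate_kmeans_labels(df, n_buckets=50, label_store='centers.npz')
# print(labels.value_counts().head())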
def generate_labels_for_df(df, n_buckets=100, method='md5', label_store=None):
    """Generate deterministic bucket labels 0..n_buckets-1 from rows using selected columns.
    Uses: report_dat, latitude, longitude, street1, street2, ward, injuries, fatalities.
    Produces reproducible labels via md5 hashing of a normalized feature string
    (or via k-means over numeric features when method='kmeans').
    """
if method == 'kmeans':
return generate_kmeans_labels(df, n_buckets=n_buckets, label_store=label_store)
# Be flexible about column names (case variations and alternate names).
colmap = {c.lower(): c for c in df.columns}
def get_col(*candidates):
for cand in candidates:
key = cand.lower()
if key in colmap:
return colmap[key]
return None
report_col = get_col('report_dat', 'reportdate', 'fromdate', 'lastupdatedate')
lat_col = get_col('latitude', 'mpdlatitude', 'lat')
lon_col = get_col('longitude', 'mpdlongitude', 'lon')
street1_col = get_col('street1', 'address', 'mar_address', 'nearestintstreetname')
street2_col = get_col('street2', 'nearestintstreetname')
ward_col = get_col('ward')
inj_cols = [c for c in df.columns if 'INJUR' in c.upper()]
fat_cols = [c for c in df.columns if 'FATAL' in c.upper()]
uid = get_col('crimeid', 'eventid', 'objectid', 'ccn')
def row_to_bucket(row):
parts = []
# date
parts.append(_normalize_date(row.get(report_col, '') if report_col else ''))
# lat/lon rounded
lat = row.get(lat_col, '') if lat_col else ''
lon = row.get(lon_col, '') if lon_col else ''
try:
parts.append(str(round(float(lat), 5)) if pd.notna(lat) and lat != '' else '')
except Exception:
parts.append('')
try:
parts.append(str(round(float(lon), 5)) if pd.notna(lon) and lon != '' else '')
except Exception:
parts.append('')
# streets and ward
parts.append(_normalize_str(row.get(street1_col, '') if street1_col else ''))
parts.append(_normalize_str(row.get(street2_col, '') if street2_col else ''))
parts.append(_normalize_str(row.get(ward_col, '') if ward_col else ''))
# injuries: sum any injury-like columns
inj_sum = 0
for c in inj_cols:
try:
v = row.get(c, 0)
inj_sum += int(v) if pd.notna(v) and v != '' else 0
except Exception:
pass
parts.append(str(inj_sum))
# fatalities: sum any fatal-like columns
fat_sum = 0
for c in fat_cols:
try:
v = row.get(c, 0)
fat_sum += int(v) if pd.notna(v) and v != '' else 0
except Exception:
pass
parts.append(str(fat_sum))
# fallback uid
if uid:
parts.append(str(row.get(uid, '')))
s = '|'.join(parts)
h = hashlib.md5(s.encode('utf-8')).hexdigest()
val = int(h, 16) % n_buckets
return val
return df.apply(row_to_bucket, axis=1)
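# Example usage (illustrative sketch on a tiny synthetic frame; the column names follow the
# candidates recognised above, and the returned labels are deterministic integers in
# 0..n_buckets-1, so re-running yields identical buckets):
# toy = pd.DataFrame({
#     'REPORTDATE': ['2024-01-01', '2024-01-02'],
#     'LATITUDE': [38.9072, 38.9100],
#     'LONGITUDE': [-77.0369, -77.0400],
#     'ADDRESS': ['MAIN ST NW', 'K ST NW'],
#     'WARD': [2, 6],
# })
# print(list(generate_labels_for_df(toy, n_buckets=10)))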
def _add_date_features(df, date_col_candidates=None):
"""Add simple date-derived numeric columns to the dataframe.
Adds: report_year, report_month, report_day, report_weekday, report_hour (where available).
If no date column is found, function is a no-op.
"""
if date_col_candidates is None:
date_col_candidates = ['report_dat', 'reportdate', 'fromdate', 'lastupdatedate', 'date', 'occur_date']
colmap = {c.lower(): c for c in df.columns}
date_col = None
for cand in date_col_candidates:
if cand.lower() in colmap:
date_col = colmap[cand.lower()]
break
if date_col is None:
return
try:
ser = pd.to_datetime(df[date_col], errors='coerce')
except Exception:
ser = pd.to_datetime(df[date_col].astype(str), errors='coerce')
df['report_year'] = ser.dt.year.fillna(-1).astype(float)
df['report_month'] = ser.dt.month.fillna(-1).astype(float)
df['report_day'] = ser.dt.day.fillna(-1).astype(float)
df['report_weekday'] = ser.dt.weekday.fillna(-1).astype(float)
# hour may not exist; if parsing fails we'll get NaN
df['report_hour'] = ser.dt.hour.fillna(-1).astype(float)
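# Example usage (illustrative sketch; mutates the frame in place, adding
# report_year/report_month/report_day/report_weekday/report_hour columns, with -1 for
# rows whose date cannot be parsed):
# toy = pd.DataFrame({'REPORTDATE': ['2024-03-15 14:30:00', None]})
# _add_date_features(toy)
# print(toy[['report_year', 'report_month', 'report_hour']])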
def _add_hashed_street(df, n_hash_buckets=32, street_col_candidates=None):
"""Add a small hashed numeric feature for street/address text fields.
Adds `street_hash_0..N-1` as dense float columns containing one-hot-ish hashed values.
Uses MD5-based hashing reduced to a bucket and then maps to a small integer vector.
"""
if street_col_candidates is None:
street_col_candidates = ['street1', 'street', 'address', 'mar_address', 'nearestintstreetname']
colmap = {c.lower(): c for c in df.columns}
street_col = None
for cand in street_col_candidates:
if cand.lower() in colmap:
street_col = colmap[cand.lower()]
break
if street_col is None:
return
    # create a single integer hash bucket per row (hashlib is already imported at module level)
def row_hash(val):
if pd.isna(val) or str(val).strip() == '':
return -1
h = hashlib.md5(str(val).encode('utf-8')).hexdigest()
return int(h, 16) % n_hash_buckets
buckets = df[street_col].apply(row_hash).fillna(-1).astype(int).to_numpy()
# create N numeric columns with a one-hot style (0/1) encoded as floats; missing bucket => zeros
for i in range(n_hash_buckets):
colname = f'street_hash_{i}'
df[colname] = (buckets == i).astype(float)
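# Example usage (illustrative sketch; adds n_hash_buckets one-hot style float columns
# street_hash_0..street_hash_{N-1}, all zero for rows with a missing address):
# toy = pd.DataFrame({'ADDRESS': ['MAIN ST NW', None, 'K ST NW']})
# _add_hashed_street(toy, n_hash_buckets=8)
# print(toy.filter(like='street_hash_').sum(axis=1))  # 1.0 for present addresses, 0.0 otherwise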
def _add_latlon_bins(df, bins=20, lat_col_candidates=None, lon_col_candidates=None):
"""Add coarse spatial bins for latitude/longitude and rounded lat/lon numeric features.
Adds: lat_round, lon_round, lat_bin, lon_bin (bins numbered 0..bins-1, -1 for missing).
"""
if lat_col_candidates is None:
lat_col_candidates = ['latitude', 'mpdlatitude', 'lat']
if lon_col_candidates is None:
lon_col_candidates = ['longitude', 'mpdlongitude', 'lon']
colmap = {c.lower(): c for c in df.columns}
lat_col = None
lon_col = None
for cand in lat_col_candidates:
if cand.lower() in colmap:
lat_col = colmap[cand.lower()]
break
for cand in lon_col_candidates:
if cand.lower() in colmap:
lon_col = colmap[cand.lower()]
break
if lat_col is None or lon_col is None:
return
try:
lat = pd.to_numeric(df[lat_col], errors='coerce')
lon = pd.to_numeric(df[lon_col], errors='coerce')
except Exception:
lat = pd.to_numeric(df[lat_col].astype(str), errors='coerce')
lon = pd.to_numeric(df[lon_col].astype(str), errors='coerce')
df['lat_round'] = lat.round(3).fillna(0.0).astype(float)
df['lon_round'] = lon.round(3).fillna(0.0).astype(float)
try:
# compute bins using quantiles if possible to get balanced bins; fallback to linear bins
valid_lat = lat.dropna()
valid_lon = lon.dropna()
if len(valid_lat) >= bins and len(valid_lon) >= bins:
            # qcut may produce NaNs for duplicates/missing values; use rank-based discretization
            df['lat_bin'] = pd.qcut(lat.rank(method='first'), q=bins, labels=False, duplicates='drop')
            df['lon_bin'] = pd.qcut(lon.rank(method='first'), q=bins, labels=False, duplicates='drop')
            # keep the documented convention of -1 for rows that cannot be binned
            df['lat_bin'] = df['lat_bin'].fillna(-1).astype(int)
            df['lon_bin'] = df['lon_bin'].fillna(-1).astype(int)
else:
lat_min, lat_max = valid_lat.min() if len(valid_lat) > 0 else 0.0, valid_lat.max() if len(valid_lat) > 0 else 0.0
lon_min, lon_max = valid_lon.min() if len(valid_lon) > 0 else 0.0, valid_lon.max() if len(valid_lon) > 0 else 0.0
lat_span = (lat_max - lat_min) + 1e-6
lon_span = (lon_max - lon_min) + 1e-6
df['lat_bin'] = (((lat - lat_min) / lat_span) * bins).fillna(-1).astype(int).clip(lower=-1, upper=bins-1)
df['lon_bin'] = (((lon - lon_min) / lon_span) * bins).fillna(-1).astype(int).clip(lower=-1, upper=bins-1)
except Exception:
# fallback: set -1 for bins
df['lat_bin'] = -1
df['lon_bin'] = -1
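# Example usage (illustrative sketch; adds lat_round/lon_round plus coarse lat_bin/lon_bin
# columns, with -1 marking rows that cannot be binned):
# toy = pd.DataFrame({'LATITUDE': [38.90, 38.91, None], 'LONGITUDE': [-77.03, -77.04, None]})
# _add_latlon_bins(toy, bins=5)
# print(toy[['lat_round', 'lon_round', 'lat_bin', 'lon_bin']])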
# Quick smoke test for label generation (run from a shell; kept commented out here):
# python - <<'PY'
# import pandas as pd
# from data import generate_labels_for_df
# df = pd.read_csv('data.csv', nrows=50, low_memory=False)
# labs = generate_labels_for_df(df, n_buckets=100)
# print(df[['REPORTDATE','LATITUDE','LONGITUDE','ADDRESS','WARD']].head().to_string())
# print('labels:', list(labs[:20]))
# PY
# Example training command (run in a terminal, not inside this script):
# python train.py data.csv --model-type mlp --generate-labels --n-buckets 100 --epochs 5 --batch-size 256 --lr 1e-3