import os
import hashlib
from datetime import datetime

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms


class ImageFolderDataset(Dataset):
    """A minimal image folder dataset expecting the structure root/class_name/*.jpg."""

    def __init__(self, root, transform=None):
        self.root = root
        self.samples = []  # list of (path, label)
        classes = sorted([d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))])
        self.class_to_idx = {c: i for i, c in enumerate(classes)}
        for c in classes:
            d = os.path.join(root, c)
            for fname in os.listdir(d):
                if fname.lower().endswith(('.png', '.jpg', '.jpeg')):
                    self.samples.append((os.path.join(d, fname), self.class_to_idx[c]))

        self.transform = transform or transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ])

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]
        img = Image.open(path).convert('RGB')
        img = self.transform(img)
        return img, label
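
# Minimal usage sketch (illustrative only; the directory 'data/images' is hypothetical
# and must follow the root/class_name/*.jpg layout described above):
#
#   from torch.utils.data import DataLoader
#   ds = ImageFolderDataset('data/images')
#   loader = DataLoader(ds, batch_size=32, shuffle=True)
#   images, labels = next(iter(loader))  # images: (32, 3, 224, 224), labels: (32,)
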
class CSVDataset(Dataset):
    """Load tabular classification data from a single CSV file.

    Expects a `label` column and numeric feature columns. Non-numeric columns are
    ignored when building the feature matrix.
    """

    def __init__(self, csv_path, feature_columns=None, label_column='label', transform=None,
                 generate_labels=False, n_buckets=100, label_method='md5', label_store=None,
                 feature_engineer=False, lat_lon_bins=20, nrows=None):
        # read the CSV with low_memory=False to avoid mixed-type warnings;
        # nrows=None loads the whole file
        self.df = pd.read_csv(csv_path, nrows=nrows, low_memory=False)
        self.label_column = label_column
        # note: `transform` is accepted for interface parity with ImageFolderDataset
        # but is not applied to tabular rows

        if generate_labels:
            # generate deterministic labels based on selected columns
            self.df[self.label_column] = generate_labels_for_df(
                self.df, n_buckets=n_buckets, method=label_method, label_store=label_store)

        # optional simple feature engineering: extract date parts and lat/lon bins
        if feature_engineer:
            try:
                _add_date_features(self.df)
            except Exception:
                pass
            try:
                _add_latlon_bins(self.df, bins=lat_lon_bins)
            except Exception:
                pass

        if label_column not in self.df.columns:
            raise ValueError(
                f"label column '{label_column}' not found in CSV; set generate_labels=True to create labels")

        # determine feature columns if not provided (numeric columns except the label)
        if feature_columns is None:
            feature_columns = [c for c in self.df.columns
                               if c != label_column and pd.api.types.is_numeric_dtype(self.df[c])]
        self.feature_columns = feature_columns

        # coerce feature columns to numeric, fill NaNs with the column mean (or 0), then standardize
        features_df = self.df[self.feature_columns].apply(lambda c: pd.to_numeric(c, errors='coerce'))
        initial_means = features_df.mean()
        features_df = features_df.fillna(initial_means).fillna(0.0)

        # drop columns that remain all-NaN after coercion/fill (unlikely after fillna(0.0)), to avoid NaNs
        all_nan_cols = features_df.columns[features_df.isna().all()].tolist()
        if len(all_nan_cols) > 0:
            # remove them from the feature list so indices stay consistent
            features_df = features_df.drop(columns=all_nan_cols)
            self.feature_columns = [c for c in self.feature_columns if c not in all_nan_cols]

        # recompute means/stds from the filled data so subtraction/division won't produce NaNs
        col_means = features_df.mean()
        col_stds = features_df.std().replace(0, 1.0).fillna(1.0)

        # standardize using the recomputed stats
        features_df = (features_df - col_means) / (col_stds + 1e-6)

        self.feature_means = col_means.to_numpy(dtype=float)
        self.feature_stds = col_stds.to_numpy(dtype=float)

        self.features = torch.tensor(features_df.values, dtype=torch.float32)
        self.labels = torch.tensor(
            pd.to_numeric(self.df[self.label_column], errors='coerce').fillna(0).astype(int).values,
            dtype=torch.long)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return self.features[idx], int(self.labels[idx])
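
# Minimal usage sketch (illustrative only; 'crashes.csv' and the chosen flag values
# are hypothetical):
#
#   ds = CSVDataset('crashes.csv', generate_labels=True, n_buckets=100,
#                   label_method='kmeans', feature_engineer=True)
#   x, y = ds[0]                      # x: float32 feature tensor, y: int class index
#   n_features = ds.features.shape[1]  # input width for an MLP
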
def _normalize_str(x):
    """Return a stripped, lower-cased string, or '' for NaN."""
    if pd.isna(x):
        return ''
    return str(x).strip().lower()


def _normalize_date(x):
    """Return the date part of a parseable timestamp as 'YYYY-MM-DD', or '' on failure."""
    try:
        # pd.to_datetime handles the common formats
        dt = pd.to_datetime(x)
        return dt.strftime('%Y-%m-%d')
    except Exception:
        return ''
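
# Illustrative behaviour of the normalizers (example values, not real data):
#   _normalize_str('  Main ST ')        -> 'main st'
#   _normalize_date('2021-07-04 13:00') -> '2021-07-04'
#   _normalize_date('not a date')       -> ''
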
def generate_kmeans_labels(df, n_buckets=100, random_state=42, label_store=None):
    """Generate labels by running k-means over numeric features (deterministic with the seed).

    This produces clusters that are predictable from numeric inputs and are therefore
    better suited for training a numeric-feature MLP than arbitrary hash buckets.
    """
    # small pure-numpy k-means to avoid an external dependency

    # select numeric columns only
    num_df = df.select_dtypes(include=['number']).fillna(0.0)
    if num_df.shape[0] == 0 or num_df.shape[1] == 0:
        # fall back to hashing if there are no numeric columns
        return generate_labels_for_df(df, n_buckets=n_buckets)

    data = num_df.values.astype(float)
    n_samples = data.shape[0]
    rng = np.random.default_rng(random_state)

    # if a label_store exists and contains centers, load and reuse them
    if label_store and os.path.exists(label_store):
        try:
            npz = np.load(label_store)
            centers = npz['centers']
            all_dists = np.linalg.norm(data[:, None, :] - centers[None, :, :], axis=2)
            all_labels = np.argmin(all_dists, axis=1)
            return pd.Series(all_labels, index=df.index)
        except Exception:
            # fall through to fitting
            pass

    # sample points to fit centers if the dataset is large
    sample_size = min(20000, n_samples)
    if sample_size < n_samples:
        idx = rng.choice(n_samples, size=sample_size, replace=False)
        sample_data = data[idx]
    else:
        sample_data = data

    # initialize centers by random sampling from sample_data
    centers_idx = rng.choice(sample_data.shape[0], size=min(n_buckets, sample_data.shape[0]), replace=False)
    centers = sample_data[centers_idx].astype(float)

    # run a small number of iterations
    max_iters = 10
    for _ in range(max_iters):
        # assignment step
        dists = np.linalg.norm(sample_data[:, None, :] - centers[None, :, :], axis=2)
        labels = np.argmin(dists, axis=1)
        # update step: recompute centers
        new_centers = np.zeros_like(centers)
        counts = np.zeros((centers.shape[0],), dtype=int)
        for i, lab in enumerate(labels):
            new_centers[lab] += sample_data[i]
            counts[lab] += 1
        for k in range(centers.shape[0]):
            if counts[k] > 0:
                new_centers[k] = new_centers[k] / counts[k]
            else:
                # reinitialize an empty cluster
                new_centers[k] = sample_data[rng.integers(0, sample_data.shape[0])]
        # check convergence (maximum center shift is small)
        shift = np.linalg.norm(new_centers - centers, axis=1).max()
        centers = new_centers
        if shift < 1e-4:
            break

    # assign labels for all rows
    all_dists = np.linalg.norm(data[:, None, :] - centers[None, :, :], axis=2)
    all_labels = np.argmin(all_dists, axis=1)
    # persist centers if requested
    if label_store:
        try:
            np.savez_compressed(label_store, centers=centers)
        except Exception:
            pass
    return pd.Series(all_labels, index=df.index)
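
# Minimal usage sketch (illustrative; 'kmeans_centers.npz' is a hypothetical path):
#
#   labels = generate_kmeans_labels(df, n_buckets=50, label_store='kmeans_centers.npz')
#   df['label'] = labels
#
# On the first call the centers are fit and saved; later calls with the same
# label_store reuse them, so separate splits get consistent cluster ids.
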
def generate_labels_for_df(df, n_buckets=100, method='md5', label_store=None):
    """Generate deterministic bucket labels 0..n_buckets-1 from rows using selected columns.

    Uses: report_dat, latitude, longitude, street1, street2, ward, injuries, fatalities.
    Produces reproducible labels via md5 hashing of a normalized feature string.
    """
    if method == 'kmeans':
        return generate_kmeans_labels(df, n_buckets=n_buckets, label_store=label_store)

    # Be flexible about column names (case variations and alternate names).
    colmap = {c.lower(): c for c in df.columns}

    def get_col(*candidates):
        for cand in candidates:
            key = cand.lower()
            if key in colmap:
                return colmap[key]
        return None

    report_col = get_col('report_dat', 'reportdate', 'fromdate', 'lastupdatedate')
    lat_col = get_col('latitude', 'mpdlatitude', 'lat')
    lon_col = get_col('longitude', 'mpdlongitude', 'lon')
    street1_col = get_col('street1', 'address', 'mar_address', 'nearestintstreetname')
    street2_col = get_col('street2', 'nearestintstreetname')
    ward_col = get_col('ward')

    inj_cols = [c for c in df.columns if 'INJUR' in c.upper()]
    fat_cols = [c for c in df.columns if 'FATAL' in c.upper()]

    uid = get_col('crimeid', 'eventid', 'objectid', 'ccn')

    def row_to_bucket(row):
        parts = []
        # date
        parts.append(_normalize_date(row.get(report_col, '') if report_col else ''))
        # lat/lon rounded to 5 decimal places
        lat = row.get(lat_col, '') if lat_col else ''
        lon = row.get(lon_col, '') if lon_col else ''
        try:
            parts.append(str(round(float(lat), 5)) if pd.notna(lat) and lat != '' else '')
        except Exception:
            parts.append('')
        try:
            parts.append(str(round(float(lon), 5)) if pd.notna(lon) and lon != '' else '')
        except Exception:
            parts.append('')

        # streets and ward
        parts.append(_normalize_str(row.get(street1_col, '') if street1_col else ''))
        parts.append(_normalize_str(row.get(street2_col, '') if street2_col else ''))
        parts.append(_normalize_str(row.get(ward_col, '') if ward_col else ''))

        # injuries: sum any injury-like columns
        inj_sum = 0
        for c in inj_cols:
            try:
                v = row.get(c, 0)
                inj_sum += int(v) if pd.notna(v) and v != '' else 0
            except Exception:
                pass
        parts.append(str(inj_sum))

        # fatalities: sum any fatality-like columns
        fat_sum = 0
        for c in fat_cols:
            try:
                v = row.get(c, 0)
                fat_sum += int(v) if pd.notna(v) and v != '' else 0
            except Exception:
                pass
        parts.append(str(fat_sum))

        # fallback unique id
        if uid:
            parts.append(str(row.get(uid, '')))

        s = '|'.join(parts)
        h = hashlib.md5(s.encode('utf-8')).hexdigest()
        return int(h, 16) % n_buckets

    return df.apply(row_to_bucket, axis=1)
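
# Illustrative example of the md5 bucketing used above (the string is made up):
#
#   s = '2021-07-04|38.9|-77.03|main st|k st|2|1|0|12345'
#   bucket = int(hashlib.md5(s.encode('utf-8')).hexdigest(), 16) % 100
#
# The same normalized string always maps to the same bucket, which is what makes
# the generated labels reproducible across runs.
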
def _add_date_features(df, date_col_candidates=None):
    """Add simple date-derived numeric columns to the dataframe in place.

    Adds: report_year, report_month, report_day, report_weekday, report_hour (where available).
    If no date column is found, the function is a no-op.
    """
    if date_col_candidates is None:
        date_col_candidates = ['report_dat', 'reportdate', 'fromdate', 'lastupdatedate', 'date', 'occur_date']
    colmap = {c.lower(): c for c in df.columns}
    date_col = None
    for cand in date_col_candidates:
        if cand.lower() in colmap:
            date_col = colmap[cand.lower()]
            break
    if date_col is None:
        return
    try:
        ser = pd.to_datetime(df[date_col], errors='coerce')
    except Exception:
        ser = pd.to_datetime(df[date_col].astype(str), errors='coerce')

    df['report_year'] = ser.dt.year.fillna(-1).astype(float)
    df['report_month'] = ser.dt.month.fillna(-1).astype(float)
    df['report_day'] = ser.dt.day.fillna(-1).astype(float)
    df['report_weekday'] = ser.dt.weekday.fillna(-1).astype(float)
    # the hour may be absent from the source column; failed parses become -1
    df['report_hour'] = ser.dt.hour.fillna(-1).astype(float)
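
# Illustrative effect on a hypothetical frame with a REPORTDATE column:
#
#   df = pd.DataFrame({'REPORTDATE': ['2021-07-04 13:05', None]})
#   _add_date_features(df)
#   # df now also has report_year/month/day/weekday/hour columns,
#   # with -1.0 in the row whose date could not be parsed.
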
def _add_hashed_street(df, n_hash_buckets=32, street_col_candidates=None):
    """Add a small hashed numeric feature for street/address text fields.

    Adds `street_hash_0..N-1` as dense float columns: each row's address text is
    MD5-hashed into one of `n_hash_buckets` buckets and encoded one-hot (missing
    values map to no bucket, i.e. all zeros).
    """
    if street_col_candidates is None:
        street_col_candidates = ['street1', 'street', 'address', 'mar_address', 'nearestintstreetname']
    colmap = {c.lower(): c for c in df.columns}
    street_col = None
    for cand in street_col_candidates:
        if cand.lower() in colmap:
            street_col = colmap[cand.lower()]
            break
    if street_col is None:
        return

    # map each row to a single integer hash bucket (-1 for missing/empty values)
    def row_hash(val):
        if pd.isna(val) or str(val).strip() == '':
            return -1
        h = hashlib.md5(str(val).encode('utf-8')).hexdigest()
        return int(h, 16) % n_hash_buckets

    buckets = df[street_col].apply(row_hash).fillna(-1).astype(int).to_numpy()
    # create N numeric 0/1 columns (one-hot as floats); a missing bucket (-1) yields all zeros
    for i in range(n_hash_buckets):
        colname = f'street_hash_{i}'
        df[colname] = (buckets == i).astype(float)
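
# Illustrative usage on a hypothetical frame:
#
#   df = pd.DataFrame({'ADDRESS': ['100 MAIN ST NW', None]})
#   _add_hashed_street(df, n_hash_buckets=8)
#   # df gains street_hash_0..street_hash_7; exactly one of them is 1.0 in the
#   # first row, and all of them are 0.0 in the row with a missing address.
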
def _add_latlon_bins(df, bins=20, lat_col_candidates=None, lon_col_candidates=None):
    """Add coarse spatial bins for latitude/longitude plus rounded lat/lon numeric features.

    Adds: lat_round, lon_round, lat_bin, lon_bin (bins numbered 0..bins-1, -1 for missing).
    """
    if lat_col_candidates is None:
        lat_col_candidates = ['latitude', 'mpdlatitude', 'lat']
    if lon_col_candidates is None:
        lon_col_candidates = ['longitude', 'mpdlongitude', 'lon']
    colmap = {c.lower(): c for c in df.columns}
    lat_col = None
    lon_col = None
    for cand in lat_col_candidates:
        if cand.lower() in colmap:
            lat_col = colmap[cand.lower()]
            break
    for cand in lon_col_candidates:
        if cand.lower() in colmap:
            lon_col = colmap[cand.lower()]
            break
    if lat_col is None or lon_col is None:
        return
    try:
        lat = pd.to_numeric(df[lat_col], errors='coerce')
        lon = pd.to_numeric(df[lon_col], errors='coerce')
    except Exception:
        lat = pd.to_numeric(df[lat_col].astype(str), errors='coerce')
        lon = pd.to_numeric(df[lon_col].astype(str), errors='coerce')

    df['lat_round'] = lat.round(3).fillna(0.0).astype(float)
    df['lon_round'] = lon.round(3).fillna(0.0).astype(float)

    try:
        # compute bins from quantiles when possible so the bins are roughly balanced;
        # otherwise fall back to linear bins
        valid_lat = lat.dropna()
        valid_lon = lon.dropna()
        if len(valid_lat) >= bins and len(valid_lon) >= bins:
            # qcut may produce NaNs for duplicates, so use rank-based discretization;
            # missing coordinates become -1 to match the docstring
            df['lat_bin'] = pd.qcut(lat.rank(method='first'), q=bins, labels=False, duplicates='drop').fillna(-1).astype(int)
            df['lon_bin'] = pd.qcut(lon.rank(method='first'), q=bins, labels=False, duplicates='drop').fillna(-1).astype(int)
        else:
            lat_min = valid_lat.min() if len(valid_lat) > 0 else 0.0
            lat_max = valid_lat.max() if len(valid_lat) > 0 else 0.0
            lon_min = valid_lon.min() if len(valid_lon) > 0 else 0.0
            lon_max = valid_lon.max() if len(valid_lon) > 0 else 0.0
            lat_span = (lat_max - lat_min) + 1e-6
            lon_span = (lon_max - lon_min) + 1e-6
            df['lat_bin'] = (((lat - lat_min) / lat_span) * bins).fillna(-1).astype(int).clip(lower=-1, upper=bins - 1)
            df['lon_bin'] = (((lon - lon_min) / lon_span) * bins).fillna(-1).astype(int).clip(lower=-1, upper=bins - 1)
    except Exception:
        # fallback: mark bins as missing
        df['lat_bin'] = -1
        df['lon_bin'] = -1
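
# Illustrative usage on a hypothetical frame:
#
#   df = pd.DataFrame({'LATITUDE': [38.90, 38.91, None], 'LONGITUDE': [-77.03, -77.02, None]})
#   _add_latlon_bins(df, bins=2)
#   # df gains lat_round/lon_round plus lat_bin/lon_bin in 0..1, with -1
#   # (or 0.0 for the rounded columns) where coordinates are missing.
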
# Debugging snippet (shell heredoc), kept commented out for reference:
#
# python - <<'PY'
# import pandas as pd
# from data import generate_labels_for_df
# df = pd.read_csv('data.csv', nrows=50, low_memory=False)
# labs = generate_labels_for_df(df, n_buckets=100)
# print(df[['REPORTDATE','LATITUDE','LONGITUDE','ADDRESS','WARD']].head().to_string())
# print('labels:', list(labs[:20]))
# PY

# Example training command (run in a terminal, not from this script):
# python train.py data.csv --model-type mlp --generate-labels --n-buckets 100 --epochs 5 --batch-size 256 --lr 1e-3