added the model
roadcast/data.py (new file, +413 lines)
@@ -0,0 +1,413 @@
import os
import hashlib
from datetime import datetime
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms


class ImageFolderDataset(Dataset):
    """A minimal image folder dataset expecting a structure: root/class_name/*.jpg"""
    def __init__(self, root, transform=None):
        self.root = root
        self.samples = []  # list of (path, label)
        classes = sorted([d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))])
        self.class_to_idx = {c: i for i, c in enumerate(classes)}
        for c in classes:
            d = os.path.join(root, c)
            for fname in os.listdir(d):
                if fname.lower().endswith(('.png', '.jpg', '.jpeg')):
                    self.samples.append((os.path.join(d, fname), self.class_to_idx[c]))

        self.transform = transform or transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ])

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]
        img = Image.open(path).convert('RGB')
        img = self.transform(img)
        return img, label

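# A minimal usage sketch for ImageFolderDataset, assuming an image directory such as
# 'data/images/<class_name>/*.jpg' (the path is a hypothetical placeholder, not part of this repo):
#
#   from torch.utils.data import DataLoader
#   ds = ImageFolderDataset('data/images')
#   loader = DataLoader(ds, batch_size=32, shuffle=True)
#   images, labels = next(iter(loader))   # images: float tensor of shape (32, 3, 224, 224)

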
class CSVDataset(Dataset):
    """Load classification tabular data from a single CSV file.

    Expects a `label` column and numeric feature columns. Non-numeric columns are dropped.
    """
    def __init__(self, csv_path, feature_columns=None, label_column='label', transform=None,
                 generate_labels=False, n_buckets=100, label_method='md5', label_store=None,
                 feature_engineer=False, lat_lon_bins=20, nrows=None):
        # read CSV with low_memory=False to avoid mixed-type warnings
        if nrows is None:
            self.df = pd.read_csv(csv_path, low_memory=False)
        else:
            self.df = pd.read_csv(csv_path, nrows=nrows, low_memory=False)
        self.label_column = label_column

        if generate_labels:
            # generate deterministic labels based on selected columns
            self.df[self.label_column] = generate_labels_for_df(self.df, n_buckets=n_buckets, method=label_method, label_store=label_store)

        # optional simple feature engineering: extract date parts and lat/lon bins
        if feature_engineer:
            try:
                _add_date_features(self.df)
            except Exception:
                pass
            try:
                _add_latlon_bins(self.df, bins=lat_lon_bins)
            except Exception:
                pass

        if label_column not in self.df.columns:
            raise ValueError(f"label column '{label_column}' not found in CSV; set generate_labels=True to create labels")

        # determine feature columns if not provided (numeric columns except label)
        if feature_columns is None:
            feature_columns = [c for c in self.df.columns if c != label_column and pd.api.types.is_numeric_dtype(self.df[c])]
        self.feature_columns = feature_columns
        # coerce feature columns to numeric, fill NaNs with column mean (or 0), then standardize
        features_df = self.df[self.feature_columns].apply(lambda c: pd.to_numeric(c, errors='coerce'))
        # fill NaNs with column mean where possible, otherwise 0
        initial_means = features_df.mean()
        features_df = features_df.fillna(initial_means).fillna(0.0)

        # drop columns that remain all-NaN after coercion/fill (unlikely after fillna(0.0)), to avoid NaNs
        all_nan_cols = features_df.columns[features_df.isna().all()].tolist()
        if len(all_nan_cols) > 0:
            # remove from feature list so indices stay consistent
            features_df = features_df.drop(columns=all_nan_cols)
            self.feature_columns = [c for c in self.feature_columns if c not in all_nan_cols]

        # recompute means/stds from the filled data so subtraction/division won't produce NaNs
        col_means = features_df.mean()
        col_stds = features_df.std().replace(0, 1.0).fillna(1.0)

        # standardize using the recomputed stats
        features_df = (features_df - col_means) / (col_stds + 1e-6)

        self.feature_means = col_means.to_numpy(dtype=float)
        self.feature_stds = col_stds.to_numpy(dtype=float)

        self.features = torch.tensor(features_df.values, dtype=torch.float32)
        self.labels = torch.tensor(pd.to_numeric(self.df[self.label_column], errors='coerce').fillna(0).astype(int).values, dtype=torch.long)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return self.features[idx], int(self.labels[idx])

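# A minimal usage sketch for CSVDataset: numeric columns become standardized features and
# labels are generated on the fly. 'data.csv' is a hypothetical path, not part of this commit.
#
#   from torch.utils.data import DataLoader
#   ds = CSVDataset('data.csv', generate_labels=True, n_buckets=100,
#                   label_method='md5', feature_engineer=True, nrows=10000)
#   loader = DataLoader(ds, batch_size=256, shuffle=True)
#   x, y = ds[0]   # x: float32 vector of length len(ds.feature_columns), y: int bucket label

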
def _normalize_str(x):
    if pd.isna(x):
        return ''
    return str(x).strip().lower()


def _normalize_date(x):
    try:
        # try parse common formats
        dt = pd.to_datetime(x)
        return dt.strftime('%Y-%m-%d')
    except Exception:
        return ''

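# Illustrative behavior of the normalization helpers above (sample inputs are hypothetical):
#   _normalize_str('  Main ST ')      -> 'main st'
#   _normalize_str(float('nan'))      -> ''
#   _normalize_date('7/4/2023 14:30') -> '2023-07-04'
#   _normalize_date('not a date')     -> ''

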
def generate_kmeans_labels(df, n_buckets=100, random_state=42, label_store=None):
    """Generate labels by running k-means over numeric features (deterministic with seed).

    This produces clusters that are predictable from numeric inputs and are therefore
    better suited for training a numeric-feature MLP than arbitrary hash buckets.
    """
    # small pure-numpy k-means to avoid an external dependency
    # select numeric columns only
    num_df = df.select_dtypes(include=['number']).fillna(0.0)
    if num_df.shape[0] == 0 or num_df.shape[1] == 0:
        # fall back to hashing if there are no numeric columns
        return generate_labels_for_df(df, n_buckets=n_buckets)

    data = num_df.values.astype(float)
    n_samples = data.shape[0]
    rng = np.random.default_rng(random_state)

    # if a label_store exists and contains centers, load and use them
    if label_store and os.path.exists(label_store):
        try:
            npz = np.load(label_store)
            centers = npz['centers']
            all_dists = np.linalg.norm(data[:, None, :] - centers[None, :, :], axis=2)
            all_labels = np.argmin(all_dists, axis=1)
            return pd.Series(all_labels, index=df.index)
        except Exception:
            # fall through to fitting
            pass

    # sample points to fit centers if the dataset is large
    sample_size = min(20000, n_samples)
    if sample_size < n_samples:
        idx = rng.choice(n_samples, size=sample_size, replace=False)
        sample_data = data[idx]
    else:
        sample_data = data

    # initialize centers by random sampling from sample_data
    centers_idx = rng.choice(sample_data.shape[0], size=min(n_buckets, sample_data.shape[0]), replace=False)
    centers = sample_data[centers_idx].astype(float)

    # run a small number of iterations
    max_iters = 10
    for _ in range(max_iters):
        # assignment step
        dists = np.linalg.norm(sample_data[:, None, :] - centers[None, :, :], axis=2)
        labels = np.argmin(dists, axis=1)
        # recompute centers
        new_centers = np.zeros_like(centers)
        counts = np.zeros((centers.shape[0],), dtype=int)
        for i, lab in enumerate(labels):
            new_centers[lab] += sample_data[i]
            counts[lab] += 1
        for k in range(centers.shape[0]):
            if counts[k] > 0:
                new_centers[k] = new_centers[k] / counts[k]
            else:
                # reinitialize an empty cluster
                new_centers[k] = sample_data[rng.integers(0, sample_data.shape[0])]
        # check convergence (maximum center shift is small)
        shift = np.linalg.norm(new_centers - centers, axis=1).max()
        centers = new_centers
        if shift < 1e-4:
            break

    # assign labels for all data
    all_dists = np.linalg.norm(data[:, None, :] - centers[None, :, :], axis=2)
    all_labels = np.argmin(all_dists, axis=1)
    # persist centers if requested
    if label_store:
        try:
            np.savez_compressed(label_store, centers=centers)
        except Exception:
            pass
    return pd.Series(all_labels, index=df.index)

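# A minimal usage sketch for generate_kmeans_labels, fitting once and reusing persisted
# centers; 'centers.npz' is a hypothetical path for the label_store.
#
#   labels = generate_kmeans_labels(df, n_buckets=50, label_store='centers.npz')
#   # a later call with the same label_store reloads the saved centers, so the same
#   # rows (with the same numeric columns) map to the same cluster ids without refitting
#   labels_again = generate_kmeans_labels(df, n_buckets=50, label_store='centers.npz')

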
def generate_labels_for_df(df, n_buckets=100, method='md5', label_store=None):
    """Generate deterministic bucket labels 0..n_buckets-1 from rows using selected columns.

    Uses: report_dat, latitude, longitude, street1, street2, ward, injuries, fatalities.
    Produces reproducible labels via md5 hashing of a normalized feature string.
    """
    if method == 'kmeans':
        return generate_kmeans_labels(df, n_buckets=n_buckets, label_store=label_store)

    # Be flexible about column names (case variations and alternate names).
    colmap = {c.lower(): c for c in df.columns}

    def get_col(*candidates):
        for cand in candidates:
            key = cand.lower()
            if key in colmap:
                return colmap[key]
        return None

    report_col = get_col('report_dat', 'reportdate', 'fromdate', 'lastupdatedate')
    lat_col = get_col('latitude', 'mpdlatitude', 'lat')
    lon_col = get_col('longitude', 'mpdlongitude', 'lon')
    street1_col = get_col('street1', 'address', 'mar_address', 'nearestintstreetname')
    street2_col = get_col('street2', 'nearestintstreetname')
    ward_col = get_col('ward')

    inj_cols = [c for c in df.columns if 'INJUR' in c.upper()]
    fat_cols = [c for c in df.columns if 'FATAL' in c.upper()]

    uid = get_col('crimeid', 'eventid', 'objectid', 'ccn')

    def row_to_bucket(row):
        parts = []
        # date
        parts.append(_normalize_date(row.get(report_col, '') if report_col else ''))
        # lat/lon rounded
        lat = row.get(lat_col, '') if lat_col else ''
        lon = row.get(lon_col, '') if lon_col else ''
        try:
            parts.append(str(round(float(lat), 5)) if pd.notna(lat) and lat != '' else '')
        except Exception:
            parts.append('')
        try:
            parts.append(str(round(float(lon), 5)) if pd.notna(lon) and lon != '' else '')
        except Exception:
            parts.append('')

        # streets and ward
        parts.append(_normalize_str(row.get(street1_col, '') if street1_col else ''))
        parts.append(_normalize_str(row.get(street2_col, '') if street2_col else ''))
        parts.append(_normalize_str(row.get(ward_col, '') if ward_col else ''))

        # injuries: sum any injury-like columns
        inj_sum = 0
        for c in inj_cols:
            try:
                v = row.get(c, 0)
                inj_sum += int(v) if pd.notna(v) and v != '' else 0
            except Exception:
                pass
        parts.append(str(inj_sum))

        # fatalities: sum any fatality-like columns
        fat_sum = 0
        for c in fat_cols:
            try:
                v = row.get(c, 0)
                fat_sum += int(v) if pd.notna(v) and v != '' else 0
            except Exception:
                pass
        parts.append(str(fat_sum))

        # fallback uid
        if uid:
            parts.append(str(row.get(uid, '')))

        s = '|'.join(parts)
        h = hashlib.md5(s.encode('utf-8')).hexdigest()
        val = int(h, 16) % n_buckets
        return val

    return df.apply(row_to_bucket, axis=1)

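# A minimal usage sketch for generate_labels_for_df: hashing is deterministic per row
# content, so repeated calls on the same dataframe yield identical buckets (df is assumed
# to be a crash-report-style dataframe with date/lat/lon/address columns).
#
#   labs = generate_labels_for_df(df, n_buckets=100, method='md5')
#   assert labs.equals(generate_labels_for_df(df, n_buckets=100, method='md5'))
#   df['label'] = labs   # values fall in 0..99

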
def _add_date_features(df, date_col_candidates=None):
    """Add simple date-derived numeric columns to the dataframe.

    Adds: report_year, report_month, report_day, report_weekday, report_hour (where available).
    If no date column is found, the function is a no-op.
    """
    if date_col_candidates is None:
        date_col_candidates = ['report_dat', 'reportdate', 'fromdate', 'lastupdatedate', 'date', 'occur_date']
    colmap = {c.lower(): c for c in df.columns}
    date_col = None
    for cand in date_col_candidates:
        if cand.lower() in colmap:
            date_col = colmap[cand.lower()]
            break
    if date_col is None:
        return
    try:
        ser = pd.to_datetime(df[date_col], errors='coerce')
    except Exception:
        ser = pd.to_datetime(df[date_col].astype(str), errors='coerce')

    df['report_year'] = ser.dt.year.fillna(-1).astype(float)
    df['report_month'] = ser.dt.month.fillna(-1).astype(float)
    df['report_day'] = ser.dt.day.fillna(-1).astype(float)
    df['report_weekday'] = ser.dt.weekday.fillna(-1).astype(float)
    # hour may not exist; if parsing fails we'll get NaN
    df['report_hour'] = ser.dt.hour.fillna(-1).astype(float)

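# A small example sketch, assuming the dataframe has a column such as 'REPORTDATE':
#
#   _add_date_features(df)
#   df[['report_year', 'report_month', 'report_weekday', 'report_hour']].head()
#   # rows whose date fails to parse get -1.0 in every derived column

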
def _add_hashed_street(df, n_hash_buckets=32, street_col_candidates=None):
    """Add a small hashed numeric feature for street/address text fields.

    Adds `street_hash_0..N-1` as dense float columns containing one-hot-style hashed values.
    Uses MD5-based hashing reduced to a bucket, then maps the bucket to a small 0/1 vector.
    """
    if street_col_candidates is None:
        street_col_candidates = ['street1', 'street', 'address', 'mar_address', 'nearestintstreetname']
    colmap = {c.lower(): c for c in df.columns}
    street_col = None
    for cand in street_col_candidates:
        if cand.lower() in colmap:
            street_col = colmap[cand.lower()]
            break
    if street_col is None:
        return

    # create a single integer hash bucket per row
    def row_hash(val):
        if pd.isna(val) or str(val).strip() == '':
            return -1
        h = hashlib.md5(str(val).encode('utf-8')).hexdigest()
        return int(h, 16) % n_hash_buckets

    buckets = df[street_col].apply(row_hash).fillna(-1).astype(int).to_numpy()
    # create N numeric columns with a one-hot style (0/1) encoding as floats; missing bucket (-1) => all zeros
    for i in range(n_hash_buckets):
        colname = f'street_hash_{i}'
        df[colname] = (buckets == i).astype(float)

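# A small example sketch, assuming an 'ADDRESS'-like column is present in df:
#
#   _add_hashed_street(df, n_hash_buckets=32)
#   # adds street_hash_0 .. street_hash_31; exactly one column is 1.0 per row
#   # (all zeros when the address is missing, since missing rows hash to bucket -1)

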
def _add_latlon_bins(df, bins=20, lat_col_candidates=None, lon_col_candidates=None):
    """Add coarse spatial bins for latitude/longitude and rounded lat/lon numeric features.

    Adds: lat_round, lon_round, lat_bin, lon_bin (bins numbered 0..bins-1, -1 for missing).
    """
    if lat_col_candidates is None:
        lat_col_candidates = ['latitude', 'mpdlatitude', 'lat']
    if lon_col_candidates is None:
        lon_col_candidates = ['longitude', 'mpdlongitude', 'lon']
    colmap = {c.lower(): c for c in df.columns}
    lat_col = None
    lon_col = None
    for cand in lat_col_candidates:
        if cand.lower() in colmap:
            lat_col = colmap[cand.lower()]
            break
    for cand in lon_col_candidates:
        if cand.lower() in colmap:
            lon_col = colmap[cand.lower()]
            break
    if lat_col is None or lon_col is None:
        return
    try:
        lat = pd.to_numeric(df[lat_col], errors='coerce')
        lon = pd.to_numeric(df[lon_col], errors='coerce')
    except Exception:
        lat = pd.to_numeric(df[lat_col].astype(str), errors='coerce')
        lon = pd.to_numeric(df[lon_col].astype(str), errors='coerce')

    df['lat_round'] = lat.round(3).fillna(0.0).astype(float)
    df['lon_round'] = lon.round(3).fillna(0.0).astype(float)

    try:
        # compute bins using quantiles if possible to get balanced bins; fall back to linear bins
        valid_lat = lat.dropna()
        valid_lon = lon.dropna()
        if len(valid_lat) >= bins and len(valid_lon) >= bins:
            # qcut may produce NaNs for duplicates; use rank-based discretization and mark missing rows as -1
            df['lat_bin'] = pd.qcut(lat.rank(method='first'), q=bins, labels=False, duplicates='drop').fillna(-1).astype(int)
            df['lon_bin'] = pd.qcut(lon.rank(method='first'), q=bins, labels=False, duplicates='drop').fillna(-1).astype(int)
        else:
            lat_min = valid_lat.min() if len(valid_lat) > 0 else 0.0
            lat_max = valid_lat.max() if len(valid_lat) > 0 else 0.0
            lon_min = valid_lon.min() if len(valid_lon) > 0 else 0.0
            lon_max = valid_lon.max() if len(valid_lon) > 0 else 0.0
            lat_span = (lat_max - lat_min) + 1e-6
            lon_span = (lon_max - lon_min) + 1e-6
            df['lat_bin'] = (((lat - lat_min) / lat_span) * bins).fillna(-1).astype(int).clip(lower=-1, upper=bins - 1)
            df['lon_bin'] = (((lon - lon_min) / lon_span) * bins).fillna(-1).astype(int).clip(lower=-1, upper=bins - 1)
    except Exception:
        # fallback: set -1 for bins
        df['lat_bin'] = -1
        df['lon_bin'] = -1

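# A small example sketch: with enough valid coordinates the bins are quantile-based, giving
# roughly equal row counts per bin; otherwise linear bins over the observed range are used.
#
#   _add_latlon_bins(df, bins=20)
#   df['lat_bin'].value_counts().sort_index()   # ~balanced counts for bins 0..19, -1 for missing

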
# Debugging code - to be removed or commented out in production
# python - <<'PY'
# import pandas as pd
# from data import generate_labels_for_df
# df = pd.read_csv('data.csv', nrows=50, low_memory=False)
# labs = generate_labels_for_df(df, n_buckets=100)
# print(df[['REPORTDATE','LATITUDE','LONGITUDE','ADDRESS','WARD']].head().to_string())
# print('labels:', list(labs[:20]))
# PY

# Command to run the training (to be executed in the terminal, not in the script)
# python train.py data.csv --model-type mlp --generate-labels --n-buckets 100 --epochs 5 --batch-size 256 --lr 1e-3