Files
VTHacks13/roadcast/debug_labels.py
samarthjain2023 0df2b0019b added the model
2025-09-27 12:14:26 -04:00

77 lines
2.5 KiB
Python

import pandas as pd
import hashlib
from data import _normalize_str, _normalize_date
p='data.csv'
df=pd.read_csv(p, nrows=50, low_memory=False)
print('Columns:', list(df.columns))
colmap = {c.lower(): c for c in df.columns}
def get_col(*candidates):
for cand in candidates:
key = cand.lower()
if key in colmap:
return colmap[key]
return None
report_col = get_col('report_dat', 'reportdate', 'fromdate', 'lastupdatedate')
lat_col = get_col('latitude', 'mpdlatitude', 'lat')
lon_col = get_col('longitude', 'mpdlongitude', 'lon')
street1_col = get_col('street1', 'address', 'mar_address', 'nearestintstreetname')
street2_col = get_col('street2', 'nearestintstreetname')
ward_col = get_col('ward')
inj_cols = [c for c in df.columns if 'INJUR' in c.upper()]
fat_cols = [c for c in df.columns if 'FATAL' in c.upper()]
uid = get_col('crimeid', 'eventid', 'objectid', 'ccn')
print('Resolved columns:')
print('report_col=', report_col)
print('lat_col=', lat_col)
print('lon_col=', lon_col)
print('street1_col=', street1_col)
print('street2_col=', street2_col)
print('ward_col=', ward_col)
print('inj_cols=', inj_cols[:10])
print('fat_cols=', fat_cols[:10])
print('uid=', uid)
for i, row in df.iterrows():
parts = []
parts.append(_normalize_date(row.get(report_col, '') if report_col else ''))
lat = row.get(lat_col, '') if lat_col else ''
lon = row.get(lon_col, '') if lon_col else ''
try:
parts.append(str(round(float(lat), 5)) if pd.notna(lat) and lat != '' else '')
except Exception:
parts.append('')
try:
parts.append(str(round(float(lon), 5)) if pd.notna(lon) and lon != '' else '')
except Exception:
parts.append('')
parts.append(_normalize_str(row.get(street1_col, '') if street1_col else ''))
parts.append(_normalize_str(row.get(street2_col, '') if street2_col else ''))
parts.append(_normalize_str(row.get(ward_col, '') if ward_col else ''))
inj_sum = 0
for c in inj_cols:
try:
v = row.get(c, 0)
inj_sum += int(v) if pd.notna(v) and v != '' else 0
except Exception:
pass
parts.append(str(inj_sum))
fat_sum = 0
for c in fat_cols:
try:
v = row.get(c, 0)
fat_sum += int(v) if pd.notna(v) and v != '' else 0
except Exception:
pass
parts.append(str(fat_sum))
if uid:
parts.append(str(row.get(uid, '')))
s='|'.join(parts)
h=hashlib.md5(s.encode('utf-8')).hexdigest()
val=int(h,16)%100
print(i, 'label=', val, 's="'+s+'"')