added the model

2025-09-27 12:14:26 -04:00
parent 4c84ab1f67
commit 0df2b0019b
28 changed files with 1633 additions and 1 deletions
--- a/roadcast/inspect_csv.py
+++ b/roadcast/inspect_csv.py
@@ -0,0 +1,34 @@
+import pandas as pd
+
+p = 'data.csv'
+print('Reading first 2000 rows of', p)
+df = pd.read_csv(p, nrows=2000, low_memory=False)
+print('Columns:', list(df.columns))
+cols = ['report_dat','latitude','longitude','street1','street2','ward','injuries','fatalities']
+print('\nField stats for label-generator columns:')
+for c in cols:
+    if c in df.columns:
+        ser = df[c]
+        try:
+            unique = ser.dropna().unique()[:5].tolist()
+        except Exception:
+            unique = []
+        print(f"{c}: present dtype={ser.dtype} n_unique={ser.nunique(dropna=False)} n_null={int(ser.isna().sum())} sample_values={unique}")
+    else:
+        print(f"{c}: MISSING")
+
+# If labels already present, show distribution
+if 'label' in df.columns:
+    print('\nLabel column present in sample:')
+    print(df['label'].value_counts().head(20))
+else:
+    print('\nLabel column not present in sample')
+
+# Also show per-column fraction NaN for numeric columns
+num_cols = df.select_dtypes(include=['number']).columns.tolist()
+print('\nNumeric columns and NaN fraction (first 20):')
+for c in num_cols[:20]:
+    ser = df[c]
+    print(f"{c}: n={len(ser)} null_frac={ser.isna().mean():.4f} min={ser.min()} max={ser.max()}")
+
+print('\nDone')