Skip to content

Utilities

Utility functions for the ClimAID package.

This module provides a collection of helper utilities for handling climate and epidemiological datasets used throughout ClimAID.

It includes functionality for:

  • Data ingestion and validation
  • Cleaning and preprocessing of datasets
  • Temporal train/test splitting for modeling workflows
  • Basic normalization and summary statistics

These utilities are designed to support reproducible and consistent data preparation across the ClimAID pipeline.

Notes

  • Functions in this module are independent of modeling and reporting layers.
  • Intended for internal use, but can be used externally for custom workflows.

Author

Avik Sam

Created

November 2025

build_district_tree(district_list)

Convert district keys like: IND_pune_maharashtra

into hierarchical structure: country → state → district

Source code in climaid\utils.py
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
def build_district_tree(district_list):
    """
    Group district keys such as ``IND_pune_maharashtra`` into a nested
    mapping: country -> state -> list of ``(district, original_key)``.

    Keys with only two underscore-separated parts are filed under the
    country ``"UNKNOWN"``; keys with fewer parts are skipped entirely.
    """
    tree = defaultdict(lambda: defaultdict(list))

    for key in district_list:
        parts = key.split("_")

        if len(parts) >= 3:
            # country_district_state, e.g. IND_pune_maharashtra
            country, district, state = parts[0].upper(), parts[1].title(), parts[2].title()
        elif len(parts) == 2:
            # district_state with no country prefix
            country, district, state = "UNKNOWN", parts[0].title(), parts[1].title()
        else:
            continue  # malformed key — nothing sensible to file it under

        tree[country][state].append((district, key))

    return tree

check_data_consistency(df, key_cols=['District', 'time'])

Verify that key identifiers (e.g., District, time) are unique and complete.

Source code in climaid\utils.py
122
123
124
125
126
127
128
129
130
131
132
133
def check_data_consistency(df, key_cols=None):
    """
    Verify that key identifiers (e.g., District, time) are unique and complete.

    Prints a warning for duplicate rows (judged on ``key_cols``) and a
    warning for missing values in those columns; prints the success message
    only when *both* checks pass.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to check.
    key_cols : list of str, optional
        Columns that together should uniquely identify a row.
        Defaults to ``["District", "time"]``.
    """
    # Avoid a mutable default argument; resolve the default per call.
    if key_cols is None:
        key_cols = ["District", "time"]

    duplicates = df.duplicated(subset=key_cols).sum()
    if duplicates > 0:
        print(f"Warning: {duplicates} duplicate entries found based on {key_cols}")

    missing = df[key_cols].isna().sum().sum()
    if missing > 0:
        print(f"Warning: {missing} missing values in key columns {key_cols}")

    # Bug fix: the original printed "passed" whenever there were no missing
    # values, even when duplicates had just been reported.
    if duplicates == 0 and missing == 0:
        print("Data consistency check passed")

clean_numeric_column(series)

Clean messy numeric data like '7.71-09' or '295.2005/092' and convert to float.

Source code in climaid\utils.py
67
68
69
70
71
72
73
74
def clean_numeric_column(series):
    """
    Coerce a messy string-like series (e.g. '7.71-09' or '295.2005/092')
    to floats; values that cannot be parsed become NaN.
    """
    cleaned = (
        series.astype(str)
        # Strip everything except digits, dots, signs, and exponent markers.
        .str.replace(r"[^0-9eE.\-]", "", regex=True)
        # A trailing '-NN' glued onto digits is a mangled exponent: '7.71-09' -> '7.71e-09'.
        .str.replace(r"(?<=\d)-(?=\d{2,}$)", "e-", regex=True)
    )
    return pd.to_numeric(cleaned, errors="coerce")

ensure_directory(path)

Create directory if it does not exist.

Source code in climaid\utils.py
55
56
57
58
59
60
def ensure_directory(path):
    """
    Make sure *path* exists as a directory (creating parents as needed)
    and return it unchanged for convenient chaining.
    """
    if not os.path.isdir(path):
        os.makedirs(path, exist_ok=True)
    return path

load_csv_safe(filepath, parse_dates=['time'])

Safely load a CSV file and parse datetime columns if present.

Source code in climaid\utils.py
40
41
42
43
44
45
46
47
48
49
50
51
52
def load_csv_safe(filepath, parse_dates=None):
    """
    Safely load a CSV file and parse datetime columns if present.

    Parameters
    ----------
    filepath : str
        Path to the CSV file.
    parse_dates : list of str, optional
        Column names to convert to datetime when present in the file.
        Defaults to ``["time"]``. Unparseable values become ``NaT``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    FileNotFoundError
        If ``filepath`` does not exist.
    RuntimeError
        If the file exists but cannot be read or parsed.
    """
    # Bug fix: the original declared `parse_dates` (as a mutable default)
    # but ignored it, always parsing only the hard-coded "time" column.
    if parse_dates is None:
        parse_dates = ["time"]

    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")

    try:
        df = pd.read_csv(filepath)
    except Exception as e:
        # Chain the cause so the underlying parser error stays visible.
        raise RuntimeError(f"Error loading {filepath}: {e}") from e

    for col in parse_dates:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")
    return df

normalize_features(df, cols)

Normalize selected numeric columns to [0, 1] range using MinMaxScaler.

Source code in climaid\utils.py
77
78
79
80
81
82
83
def normalize_features(df, cols):
    """
    Scale the listed numeric columns of *df* into the [0, 1] range using
    ``MinMaxScaler`` and return the DataFrame.

    NOTE(review): this writes the scaled values back into the caller's
    DataFrame (in-place mutation) — confirm callers expect that.
    """
    df[cols] = MinMaxScaler().fit_transform(df[cols])
    return df

pretty_country(code)

Convert ISO3 country code to human readable name.

Example

pretty_country("IND") 'India'

Source code in climaid\utils.py
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
def pretty_country(code: str) -> str:
    """
    Map an ISO3 country code to a human-readable name.

    Falsy input is returned unchanged; unknown codes fall back to the
    upper-cased code itself.

    Example
    -------
    >>> pretty_country("IND")
    'India'
    """
    if not code:
        return code

    normalized = code.upper()
    return COUNTRY_NAMES.get(normalized, normalized)

print_summary(df, label='Data')

Print summary statistics and missing value info.

Source code in climaid\utils.py
108
109
110
111
112
113
114
115
116
117
def print_summary(df, label="Data"):
    """
    Print a quick overview of *df*: shape, date range (when a "time"
    column exists), and per-column missing-value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to summarize.
    label : str, optional
        Heading used in the printed banner.
    """
    print(f"\n--- {label} Summary ---")
    print(f"Shape: {df.shape}")
    if "time" in df.columns:
        # Bug fix: min and max were concatenated with no separator,
        # printing e.g. "2020-01-01 00:00:002021-01-01 00:00:00".
        print(f"Date range: {df['time'].min()} → {df['time'].max()}")
    print(f"Missing values:\n{df.isna().sum()}")
    print("---------------------------")

split_train_test(df, date_col='time', cutoff_year=2020)

Split a dataset into training and testing subsets by year.

  • Training: all data before cutoff_year
  • Testing: all data after cutoff_year (rows within the cutoff year itself fall in neither split)

Source code in climaid\utils.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
def split_train_test(df, date_col="time", cutoff_year=2020):
    """
    Split a dataset into training and testing subsets by year.

    - Training: all rows strictly before ``cutoff_year``
    - Testing: all rows strictly after ``cutoff_year``
      (rows within the cutoff year itself fall in neither split)

    Both returned frames carry a helper ``"year"`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to split; it is left unmodified.
    date_col : str, optional
        Column holding the timestamps. Unparseable values become ``NaT``
        (and a ``NaN`` year, so those rows land in neither split).
    cutoff_year : int, optional
        Year used as the boundary between train and test.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)``.
    """
    # Bug fix: the original mutated the caller's frame in place, converting
    # `date_col` and adding a "year" column as a side effect.
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    df["year"] = df[date_col].dt.year

    df_train = df[df["year"] < cutoff_year].copy()
    df_test = df[df["year"] > cutoff_year].copy()
    return df_train, df_test