Source code for elliptic_toolkit.temporal_cv

import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import TimeSeriesSplit


class TemporalRollingCV(TimeSeriesSplit):
    """
    Time-based cross-validation iterator that extends scikit-learn's
    TimeSeriesSplit to work with data that has explicit time step values
    (like the Elliptic Bitcoin dataset).

    This class inherits from TimeSeriesSplit and adds functionality to handle
    datasets where multiple samples can belong to the same time step. The
    parent splitter is run over the sorted *unique* time steps, and the
    resulting time-step folds are then mapped back to row indices of the
    dataset.

    This CV strategy ensures that for each fold:
    1. Training data comes from earlier time periods
    2. The test set is a continuous time window following the training data
    3. Each fold expands the training window (unless capped by
       `max_train_size`) and shifts the test window forward

    Parameters:
    -----------
    n_splits : int, default=5
        Number of splits to generate
    test_size : int, default=None
        Size of test window in time steps. If None, it is derived by
        TimeSeriesSplit from the number of unique time steps and n_splits.
    max_train_size : int, default=None
        Maximum number of time steps to use for training. If None, all
        available time steps will be used.
    gap : int, default=0
        Number of time steps to skip between training and test sets
    time_col : str, default='time'
        Name of the column containing time step information
    """

    def __init__(
            self,
            n_splits=5,
            *,
            test_size=None,
            max_train_size=None,
            gap=0,
            time_col='time'):
        super().__init__(
            n_splits=n_splits,
            test_size=test_size,
            max_train_size=max_train_size,
            gap=gap)
        self.time_col = time_col

    def _resolve_times(self, X, groups):
        """Return the per-sample time values taken from `X` or, failing that, `groups`."""
        column = None
        if isinstance(X, pd.DataFrame):
            # Item access instead of attribute access: works for column names
            # that are not valid identifiers (e.g. "time step") or that are
            # shadowed by DataFrame attributes (e.g. "count"), and never
            # mistakes a non-column attribute such as `dtypes` for the data.
            if self.time_col in X.columns:
                column = X[self.time_col]
        else:
            # Non-DataFrame containers may still expose the time values as a
            # Series-valued attribute.
            column = getattr(X, self.time_col, None)

        # Duplicate column labels make `X[col]` a DataFrame; such a lookup is
        # treated as "no usable column" and falls through to `groups`.
        if isinstance(column, pd.Series):
            return column.to_numpy()
        if groups is not None:
            return groups
        raise ValueError(
            f"X must have a '{self.time_col}' column or time values must be passed as groups")

    def split(self, X, y=None, groups=None):
        """
        Generate indices to split data into training and test sets.

        Unlike standard TimeSeriesSplit, this method works with explicit time
        step values and maps them to actual row indices in the dataset. This
        allows it to handle datasets where multiple samples can belong to the
        same time step.

        Parameters:
        -----------
        X : array-like, DataFrame
            Training data. If DataFrame, must contain the column specified by
            `time_col`. Otherwise, time values must be passed through the
            `groups` parameter.
        y : array-like, optional
            Targets for the training data (ignored)
        groups : array-like, optional
            Time values for each sample if X doesn't have the time column
            specified by time_col. May be a numpy array, pandas Series,
            torch tensor, list, or tuple.

        Yields:
        -------
        train_index : ndarray or torch.Tensor
            Indices of rows in the training set
        test_index : ndarray or torch.Tensor
            Indices of rows in the test set

        Raises:
        -------
        ValueError
            If no time values can be found, or if they are of an unsupported
            type.

        Notes:
        ------
        The yielded indices refer to row positions in the original dataset,
        not time steps. This makes the cross-validator compatible with
        scikit-learn's model selection tools. Indices are torch tensors only
        when the time values were supplied as a torch tensor; otherwise they
        are numpy arrays.
        """
        times = self._resolve_times(X, groups)

        # Pick the array backend that matches the container type.
        if isinstance(times, torch.Tensor):
            mod = torch
        elif isinstance(times, (np.ndarray, pd.Series, list, tuple)):
            times = np.asarray(times)
            mod = np
        else:
            raise ValueError(
                "times must be a numpy array, torch tensor, pandas Series, list, or tuple")

        # Both np.unique and torch.unique return the distinct values sorted,
        # so position i in `unique_times` is the i-th time step in order.
        unique_times = mod.unique(times)

        # The parent splitter operates on time steps; each fold of time steps
        # is then expanded to every row that belongs to those steps.
        for train_times, test_times in super().split(unique_times):
            train_mask = mod.isin(times, unique_times[train_times])
            test_mask = mod.isin(times, unique_times[test_times])

            train_indices = mod.where(train_mask)[0]
            test_indices = mod.where(test_mask)[0]

            yield train_indices, test_indices