Source code for tsbootstrap.time_series_simulator

"""
Time series simulation: Generating synthetic realizations with statistical fidelity.

This module provides sophisticated simulation capabilities for time series models,
enabling the generation of synthetic data that preserves the statistical properties
of fitted models. Through careful implementation of model-specific algorithms,
we create realizations that are statistically indistinguishable from the original
process while incorporating appropriate randomness.

The simulation framework serves multiple critical purposes: validating bootstrap
methods through Monte Carlo studies, generating forecast scenarios, and testing
system behavior under various conditions. Each simulation algorithm has been
validated against theoretical properties to ensure statistical correctness.
"""

from numbers import Integral
from typing import List, Optional, Union

import numpy as np

from tsbootstrap.utils.types import ModelTypes
from tsbootstrap.utils.validate import (
    validate_fitted_model,
    validate_integers,
    validate_literal_type,
    validate_rng,
    validate_X_and_y,
)



[docs]
class TimeSeriesSimulator:
    """
    Advanced simulation engine for time series model realizations.

    This class implements state-of-the-art simulation algorithms for various
    time series models, from simple autoregressive processes to complex
    GARCH specifications. We've designed the implementation to balance
    statistical accuracy with computational efficiency, ensuring that simulated
    series maintain the essential properties of the underlying stochastic process.

    The simulator handles critical details that are often overlooked: proper
    initialization through burn-in periods, correct propagation of multivariate
    dependencies, and appropriate treatment of model-specific constraints. Each
    simulation method has been validated against known theoretical results and
    empirical benchmarks.

    Our architecture supports both single realizations and bulk generation for
    Monte Carlo studies. The flexible design accommodates various model types
    while maintaining a consistent interface, simplifying integration into
    larger analytical workflows.

    Attributes
    ----------
    n_samples : int
        Length of the time series to simulate, calibrated from the fitted model.
        This ensures consistency between original and simulated data.

    n_features : int
        Dimensionality of the time series. Supports both univariate (n_features=1)
        and multivariate simulations with proper cross-series dependencies.

    burnin : int
        Number of initial observations to discard, allowing the process to reach
        its stationary distribution. Automatically calibrated based on series length.
    """

    _tags = {"python_dependencies": ["arch", "statsmodels"]}

    def __init__(
        self,
        fitted_model,
        X_fitted: np.ndarray,
        rng=None,
    ) -> None:
        """
        Initialize the TimeSeriesSimulator class.

        Parameters
        ----------
        fitted_model: FittedModelTypes
            A fitted model object.
        X_fitted: np.ndarray
            Array of fitted values.
        rng: Optional[Union[Integral, Generator]], optional
            Random number generator instance. Defaults to None.
        """
        self.fitted_model = fitted_model
        self.X_fitted = X_fitted
        self.rng = rng
        self.n_samples, self.n_features = self.X_fitted.shape
        self.burnin = min(100, self.n_samples // 3)

    @property
    def fitted_model(self):
        """Get the fitted model."""
        return self._fitted_model

    @fitted_model.setter
    def fitted_model(self, fitted_model) -> None:
        """Set the fitted model, ensuring it's validated first."""
        validate_fitted_model(fitted_model)
        self._fitted_model = fitted_model

    @property
    def X_fitted(self) -> np.ndarray:
        """Get the array of fitted values."""
        return self._X_fitted

    @X_fitted.setter
    def X_fitted(self, value: np.ndarray) -> None:
        """
        Set the array of fitted values.

        Parameters
        ----------
        value: np.ndarray
            Array of fitted values to set.
        """
        if not isinstance(value, np.ndarray):
            raise TypeError(f"X_fitted must be a NumPy array. Got {type(value)}.")
        if not np.issubdtype(value.dtype, np.floating):
            raise TypeError(f"X_fitted dtype must be a float type. Got {value.dtype}.")
        from arch.univariate.base import ARCHModelResult
        from statsmodels.tsa.vector_ar.var_model import VARResultsWrapper

        model_is_var = isinstance(self.fitted_model, VARResultsWrapper)
        model_is_arch = isinstance(self.fitted_model, ARCHModelResult)
        self._X_fitted, _ = validate_X_and_y(
            value, None, model_is_var=model_is_var, model_is_arch=model_is_arch
        )

    @property
    def rng(self):
        """Get the random number generator instance."""
        return self._rng

    @rng.setter
    def rng(self, rng) -> None:
        """
        Set the random number generator instance.

        Parameters
        ----------
        rng: Optional[Union[Integral, Generator]]
            Random number generator instance.
        """
        self._rng = validate_rng(rng, allow_seed=True)

    def _validate_ar_simulation_params(self, params: dict) -> None:
        """
        Validate the parameters necessary for the simulation.
        """
        required_params = ["resids_lags", "resids_coefs", "resids"]
        for param in required_params:
            if params.get(param) is None:
                # logger.error(f"{param} is not provided.")
                raise ValueError(f"{param} must be provided for the AR model.")

    def _simulate_ar_residuals(
        self,
        lags: np.ndarray,
        coefs: np.ndarray,
        init: np.ndarray,
        max_lag: Integral,
        n_samples: int,
    ) -> np.ndarray:
        """
        Simulates an Autoregressive (AR) process with given lags, coefficients, initial values, and random errors.

        Parameters
        ----------
        lags: np.ndarray
            The lags to be used in the AR process. Can be non-consecutive, but when called from `generate_samples_sieve`, it will be sorted.
        coefs: np.ndarray
            The coefficients corresponding to each lag. Of shape (1, len(lags)). Sorted by `generate_samples_sieve` corresponding to the sorted `lags`.
        init: np.ndarray
            The initial values for the simulation. Should be at least as long as the maximum lag.
        n_samples: int
            The number of samples to generate.

        Returns
        -------
        np.ndarray
            The simulated AR process as a 1D NumPy array.

        Raises
        ------
        ValueError
            If `lags` or `coefs` are not provided.
            If `coefs` is not a 1D NumPy array.
            If `coefs` is not the same length as `lags`.
            If `init` is not the same length as `max_lag`.

        TypeError
            If `lags` is not an integer or a list of integers.
        """
        random_errors = self.rng.normal(size=n_samples)

        if len(init) < max_lag:  # type: ignore
            raise ValueError(
                "Length of 'init' must be at least as long as the maximum lag in 'lags'"
            )

        # In case init is 2d with shape (X, 1), convert it to 1d
        init = init.ravel()
        series = np.zeros(n_samples, dtype=init.dtype)
        series[:max_lag] = init

        # Calculate trend terms directly
        trend_terms = 0
        if hasattr(self.fitted_model, "model") and hasattr(self.fitted_model.model, "trend"):
            trend = self.fitted_model.model.trend
            if trend == "n":  # no trend
                trend_terms = 0
            elif trend in ["c", "t"]:  # constant or time trend
                trend_terms = 1
            elif trend == "ct":  # constant + time trend
                trend_terms = 2

        if trend_terms > 0:
            intercepts = self.fitted_model.params[:trend_terms].reshape(1, trend_terms)
        else:
            intercepts = np.array([[]])

        # Loop through the series, calculating each value based on the lagged values, coefficients, random error, and trend term
        for t in range(max_lag, n_samples):
            ar_term = 0
            for i in range(len(lags)):
                ar_term_iter = coefs[0, i] * series[t - lags[i]]
                ar_term += ar_term_iter

            trend_term = 0
            # If the trend is 'c' or 'ct', add a constant term
            if self.fitted_model.model.trend in ["c", "ct"]:
                trend_term += intercepts[0, 0]
            # If the trend is 't' or 'ct', add a linear time trend
            if self.fitted_model.model.trend in ["t", "ct"]:
                trend_term += intercepts[0, -1] * t

            series[t] = ar_term + trend_term + random_errors[t]

        return series


[docs]
    def simulate_ar_process(
        self,
        resids_lags: Union[Integral, List[Integral]],
        resids_coefs: np.ndarray,
        resids: np.ndarray,
        n_samples: int,
    ) -> np.ndarray:
        """
        Simulate AR process from the fitted model.

        Parameters
        ----------
        resids_lags: Union[Integral, List[Integral]]
            The lags to be used in the AR process. Can be non-consecutive, but when called from `generate_samples_sieve`, it will be sorted.
        resids_coefs: np.ndarray
            The coefficients corresponding to each lag. Of shape (1, len(lags)). Sorted by `generate_samples_sieve` corresponding to the sorted `lags`.
        resids: np.ndarray
            The initial values for the simulation. Should be at least as long as the maximum lag.
        n_samples: int
            The number of samples to generate.

        Returns
        -------
        np.ndarray
            The simulated AR process as a 1D NumPy array.

        Raises
        ------
        ValueError
            If `resids_lags`, `resids_coefs`, or `resids` are not provided.
            If `resids_coefs` is not a 1D NumPy array.
            If `resids_coefs` is not the same length as `resids_lags`.
            If `resids` is not the same length as `X_fitted`.

        TypeError
            If `fitted_model` is not an instance of `AutoRegResultsWrapper`.
            If `resids_lags` is not an integer or a list of integers.
        """
        from statsmodels.tsa.ar_model import AutoRegResultsWrapper

        validate_integers(resids_lags, min_value=1)  # type: ignore

        if not isinstance(self.fitted_model, AutoRegResultsWrapper):
            # logger.error("fitted_model must be an instance of AutoRegResultsWrapper.")
            raise TypeError(
                f"fitted_model must be an instance of AutoRegResultsWrapper. Got {type(self.fitted_model)}."
            )

        if self.n_features > 1:
            raise ValueError("Only univariate time series are supported for the AR model.")
        if n_samples != len(resids):
            raise ValueError(
                "Length of 'resids' must be the same as the number of samples to generate."
            )

        # In case resids is 2d with shape (X, 1), convert it to 1d
        resids = resids.ravel()
        # In case X_fitted is 2d with shape (X, 1), convert it to 1d
        X_fitted = self.X_fitted.ravel()
        # Generate the bootstrap series
        bootstrap_series = np.zeros(n_samples, dtype=X_fitted.dtype)
        # Convert resids_lags to a NumPy array if it is not already. When called from `generate_samples_sieve`, it will be sorted.
        resids_lags = (
            np.arange(1, resids_lags + 1)
            if isinstance(resids_lags, Integral)
            else np.array(sorted(resids_lags))
        )  # type: ignore
        # resids_lags.shape: (n_lags,)
        max_lag = np.max(resids_lags)  # type: ignore
        if resids_coefs.shape[0] != 1:
            raise ValueError("AR coefficients must be a 1D NumPy array of shape (1, X)")
        if resids_coefs.shape[1] != len(resids_lags):  # type: ignore
            raise ValueError("Length of 'resids_coefs' must be the same as the length of 'lags'")

        # Simulate residuals using the AR model
        simulated_residuals = self._simulate_ar_residuals(
            lags=resids_lags,  # type: ignore
            coefs=resids_coefs,
            init=resids[:max_lag],
            max_lag=max_lag,
            n_samples=n_samples,
        )
        # simulated_residuals.shape: (n_samples,)

        bootstrap_series[:max_lag] = X_fitted[:max_lag]

        # Loop through the series, calculating each value based on the lagged values, coefficients, and random error
        for t in range(max_lag, n_samples):
            lagged_values = bootstrap_series[t - resids_lags]  # type: ignore
            # lagged_values.shape: (n_lags,)
            lagged_values = lagged_values.reshape(-1, 1)
            # lagged_values.shape: (n_lags, 1)
            bootstrap_series[t] = resids_coefs @ lagged_values + simulated_residuals[t]

        return bootstrap_series.reshape(-1, 1)


    def _simulate_non_ar_residuals(self, n_samples: int) -> np.ndarray:
        """
        Simulate residuals according to the model type.

        Parameters
        ----------
        n_samples: int
            The number of samples to generate.

        Returns
        -------
        np.ndarray
            The simulated residuals.
        """
        from arch.univariate.base import ARCHModelResult
        from statsmodels.tsa.arima.model import ARIMAResultsWrapper
        from statsmodels.tsa.statespace.sarimax import SARIMAXResultsWrapper
        from statsmodels.tsa.vector_ar.var_model import VARResultsWrapper

        # self.rng is always a Generator instance. Generate an integer seed for methods that require it.
        current_sim_seed = self.rng.integers(0, 2**32 - 1)

        if isinstance(self.fitted_model, (ARIMAResultsWrapper, SARIMAXResultsWrapper)):
            # These models' simulate methods can take a Generator instance directly for random_state
            return self.fitted_model.simulate(
                nsimulations=n_samples + self.burnin,
                random_state=self.rng,
            )
        elif isinstance(self.fitted_model, VARResultsWrapper):
            # VARResultsWrapper.simulate_var takes an integer seed
            return self.fitted_model.simulate_var(
                steps=n_samples + self.burnin, seed=current_sim_seed
            )
        elif isinstance(self.fitted_model, ARCHModelResult):
            # Due to observed inconsistencies in arch library behavior in some environments
            # (ARCHModelResult.simulate missing, or model.simulate() rejecting random_state),
            # we fall back to calling model.simulate() without attempting to pass random_state
            # or using global np.random.seed(). This may compromise reproducibility for ARCH models
            # if the underlying arch model's simulate() method doesn't handle seeding consistently.

            sim_args = {
                "params": self.fitted_model.params,
                "nobs": n_samples + self.burnin,
                # No 'random_state' here
            }

            # Note: arch.__version__ and current_sim_seed are not used in this specific path
            # as we cannot reliably control seeding via arch's API in this scenario.

            simulation_output = self.fitted_model.model.simulate(**sim_args)  # type: ignore[attr-defined]
            return np.asarray(simulation_output.data.values)
        raise ValueError(f"Unsupported fitted model type {self.fitted_model}.")


[docs]
    def simulate_non_ar_process(self, n_samples: int) -> np.ndarray:
        """
        Simulate a time series from the fitted model.

        Parameters
        ----------
        n_samples: int
            The number of samples to generate.

        Returns
        -------
            np.ndarray: The simulated time series.
        """
        from statsmodels.tsa.arima.model import ARIMAResultsWrapper
        from statsmodels.tsa.statespace.sarimax import SARIMAXResultsWrapper
        from statsmodels.tsa.vector_ar.var_model import VARResultsWrapper

        simulated_residuals = self._simulate_non_ar_residuals(n_samples)
        simulated_residuals = np.reshape(simulated_residuals, (-1, self.n_features))
        # Discard the burn-in samples for certain models
        from arch.univariate.base import ARCHModelResult

        if isinstance(
            self.fitted_model,
            (
                VARResultsWrapper,
                ARIMAResultsWrapper,
                SARIMAXResultsWrapper,
                ARCHModelResult,
            ),
        ):
            simulated_residuals = simulated_residuals[self.burnin :]
        return self.X_fitted[:n_samples] + simulated_residuals[:n_samples]



[docs]
    def generate_samples_sieve(
        self,
        model_type: ModelTypes,
        resids_lags: Optional[Union[Integral, List[Integral]]] = None,
        resids_coefs: Optional[np.ndarray] = None,
        resids: Optional[np.ndarray] = None,
        n_samples: Optional[int] = None,
    ) -> np.ndarray:
        """
        Generate a bootstrap sample using the sieve bootstrap.

        Parameters
        ----------
        model_type: ModelTypes
            The model type used for the simulation.
        resids_lags: Optional[Union[Integral, List[Integral]]], optional
            The lags to be used in the AR process. Can be non-consecutive.
        resids_coefs: Optional[np.ndarray], optional
            The coefficients corresponding to each lag. Of shape (1, len(lags)).
        resids: Optional[np.ndarray], optional
            The initial values for the simulation. Should be at least as long as the maximum lag.
        n_samples: Optional[int], default=None
            The number of samples to generate. If None, uses self.n_samples.

        Returns
        -------
        np.ndarray
            The bootstrap sample.

        Raises
        ------
        ValueError
            If `resids_lags`, `resids_coefs`, or `resids` are not provided.
        """
        if n_samples is None:
            n_samples = self.n_samples

        validate_literal_type(model_type, ModelTypes)
        if model_type == "ar":
            self._validate_ar_simulation_params(
                {
                    "resids_lags": resids_lags,
                    "resids_coefs": resids_coefs,
                    "resids": resids,
                }
            )
            return self.simulate_ar_process(resids_lags, resids_coefs, resids, n_samples)  # type: ignore
        else:
            return self.simulate_non_ar_process(n_samples)


    def __repr__(self) -> str:
        return f"TimeSeriesSimulator(fitted_model={self.fitted_model}, n_samples={self.n_samples}, n_features={self.n_features})"

    def __str__(self) -> str:
        return f"TimeSeriesSimulator with {self.n_samples} samples and {self.n_features} features using fitted model {self.fitted_model}"

    def __eq__(self, other: object) -> bool:
        if isinstance(other, TimeSeriesSimulator):
            return (
                self.fitted_model == other.fitted_model
                and np.array_equal(self.X_fitted, other.X_fitted)
                and self.rng == other.rng
            )
        return False