"""
Lag ranking algorithms: Data-driven order selection for time series models.
Choosing the right model order remains one of the most challenging aspects
of time series analysis. Too few lags and we miss important dynamics; too
many and we overfit, capturing noise as signal. This module implements our
solution: systematic lag evaluation using multiple criteria.
We've found that no single criterion works best in all cases. AIC tends
toward larger models, BIC prefers parsimony, and PACF captures statistical
significance. By combining these perspectives, we achieve more robust order
selection than any single method provides.
The implementation reflects lessons learned from thousands of model fits
across diverse domains. Financial data often needs more lags than theory
suggests, sensor data benefits from conservative selection, and economic
series require careful balance. This module encodes that experience into
algorithms that adapt to your data's characteristics.
"""
from __future__ import annotations
import logging
from numbers import Integral
# NOTE: `Integral` (imported above) is still needed: the `max_lag` setter casts its lower bound to it.
from typing import Optional, cast # Added Optional
import numpy as np
from tsbootstrap.utils.types import ModelTypes
from tsbootstrap.utils.validate import validate_integers, validate_literal_type
logger = logging.getLogger(__name__)
[docs]
class RankLags:
    """
    Intelligent lag selection through multi-criteria evaluation.

    The class fits one model per candidate lag (1..``max_lag``) and ranks the
    lags by AIC and BIC; for univariate data it additionally looks at which
    lags have a significant partial autocorrelation (PACF). The three views
    are combined by `estimate_conservative_lag` into a single, deliberately
    small lag order.

    When ``save_models=True`` the fitted models of the most recent ranking
    run are cached and can be retrieved with `get_model`.

    Methods
    -------
    rank_lags_by_aic_bic()
        Rank lags using information criteria that balance fit and complexity.
    rank_lags_by_pacf()
        Return the lags whose partial autocorrelation is significant.
    estimate_conservative_lag()
        Select a robust lag order by combining multiple criteria.
    get_model(order)
        Retrieve a cached model for detailed analysis.

    Examples
    --------
    >>> from tsbootstrap import RankLags
    >>> import numpy as np
    >>> # White-noise series, used only to demonstrate the API
    >>> np.random.seed(42)
    >>> X = np.random.normal(size=(100, 1))
    >>> rank_obj = RankLags(X, model_type='ar')
    >>>
    >>> # Get conservative lag estimate
    >>> lag = rank_obj.estimate_conservative_lag()  # doctest: +SKIP
    >>>
    >>> # See detailed rankings by different criteria
    >>> aic_ranks, bic_ranks = rank_obj.rank_lags_by_aic_bic()  # doctest: +SKIP
    >>> print(f"AIC ranking: {aic_ranks[:3]}")  # doctest: +SKIP
    >>> print(f"BIC ranking: {bic_ranks[:3]}")  # doctest: +SKIP
    """

    _tags = {"python_dependencies": "statsmodels"}
    _model_type: ModelTypes  # Class-level annotation for the backing field

    def __init__(
        self,
        X: np.ndarray,
        model_type: ModelTypes,
        max_lag: int = 10,
        y: Optional[np.ndarray] = None,
        save_models: bool = False,
    ) -> None:
        """
        Initialize the RankLags object.

        Parameters
        ----------
        X : np.ndarray
            The input data.
        model_type : str
            The type of model to fit. One of 'ar', 'arima', 'sarima', 'var', 'arch'.
        max_lag : int, optional, default=10
            Maximum lag to consider.
        y : np.ndarray, optional, default=None
            Exogenous variables to include in the model.
        save_models : bool, optional, default=False
            Whether to keep the fitted models so `get_model` can return them.

        Raises
        ------
        TypeError
            If ``X`` is not a numpy array, or ``y`` is neither a numpy array
            nor None.
        """
        # Every assignment below goes through the validating property setter.
        self.X = X
        self.max_lag = max_lag
        self.model_type = model_type
        self.y = y
        self.save_models = save_models
        # models[i] holds the fit for lag i + 1 (None if that fit failed).
        self.models: list = []

    @property
    def X(self) -> np.ndarray:
        """
        The input data.

        Returns
        -------
        np.ndarray
            The input data.
        """
        return self._X

    @X.setter
    def X(self, value: np.ndarray) -> None:
        """
        Set the input data.

        Parameters
        ----------
        value : np.ndarray
            The input data.

        Raises
        ------
        TypeError
            If ``value`` is not a numpy array.
        """
        if not isinstance(value, np.ndarray):
            raise TypeError("X must be a numpy array.")
        self._X = value

    @property
    def max_lag(self) -> int:
        """
        Maximum lag to consider.

        Returns
        -------
        int
            Maximum lag to consider.
        """
        return self._max_lag

    @max_lag.setter
    def max_lag(self, value: int) -> None:
        """
        Set the maximum lag to consider.

        Parameters
        ----------
        value : int
            Maximum lag to consider. Validated by ``validate_integers`` with
            a lower bound of 1.
        """
        validate_integers(value, min_value=cast(Integral, 1))
        self._max_lag = value

    @property
    def model_type(self) -> ModelTypes:
        """
        The type of model to fit.

        Returns
        -------
        ModelTypes
            The type of model to fit.
        """
        return self._model_type

    @model_type.setter
    def model_type(self, value: ModelTypes) -> None:
        """
        Set the type of model to fit.

        Parameters
        ----------
        value : ModelTypes
            The type of model to fit. One of 'ar', 'arima', 'sarima', 'var', 'arch'.
        """
        validate_literal_type(value, ModelTypes)
        self._model_type = value

    @property
    def y(self) -> Optional[np.ndarray]:
        """
        Exogenous variables to include in the model.

        Returns
        -------
        np.ndarray or None
            Exogenous variables to include in the model, if any.
        """
        return self._y

    @y.setter
    def y(self, value: Optional[np.ndarray]) -> None:
        """
        Set the exogenous variables to include in the model.

        Parameters
        ----------
        value : np.ndarray or None
            Exogenous variables to include in the model.

        Raises
        ------
        TypeError
            If ``value`` is neither a numpy array nor None.
        """
        if value is not None and not isinstance(value, np.ndarray):
            raise TypeError("y must be a numpy array or None.")
        self._y = value

    def _prepare_endog(self) -> np.ndarray:
        """Shape ``X`` for the fitting backend (1-D unless the model is 'var')."""
        X = self.X
        if X.ndim == 1:
            return X
        if X.ndim == 2 and X.shape[1] == 1:
            # Single column, flatten for univariate models
            return X.flatten()
        # Multi-column data: only VAR consumes all columns; every other model
        # type is fitted on the first column alone.
        return X if self.model_type == "var" else X[:, 0].flatten()

    @staticmethod
    def _criterion_or_inf(model, name: str, lag: int) -> float:
        """Read criterion ``name`` from ``model``; fall back to inf if absent."""
        if hasattr(model, name):
            return getattr(model, name)
        logger.warning(
            "Model for lag %d does not have '%s' attribute. Using inf.", lag, name
        )
        return np.inf

    def rank_lags_by_aic_bic(self) -> tuple[np.ndarray, np.ndarray]:
        """
        Rank lags based on Akaike information criterion (AIC) and Bayesian information criterion (BIC).

        One model is fitted per lag in ``1..max_lag``. A lag whose fit fails
        (or yields no model) gets an infinite AIC/BIC so that it sorts last.
        When ``save_models`` is True the model cache is rebuilt from scratch
        on every call, with ``None`` stored for failed fits.

        Returns
        -------
        Tuple[np.ndarray, np.ndarray]
            aic_ranked_lags: All lags 1..max_lag, best (lowest) AIC first.
            bic_ranked_lags: All lags 1..max_lag, best (lowest) BIC first.
        """
        from tsbootstrap.backends.adapter import fit_with_backend

        endog = self._prepare_endog()

        # Rebuild the cache on every run; appending to an old cache would let
        # get_model() keep returning models from a previous call.
        if self.save_models:
            self.models = []

        aic_values: list[float] = []
        bic_values: list[float] = []
        for lag in range(1, self.max_lag + 1):
            try:
                model = fit_with_backend(
                    model_type=self.model_type,
                    endog=endog,
                    exog=self.y,
                    order=lag,
                    seasonal_order=None,  # RankLags doesn't use seasonal models
                    force_backend="statsmodels",
                    return_backend=False,  # Get adapter for compatibility
                )
            except Exception as e:  # best-effort: one bad lag must not abort the scan
                logger.warning(
                    "An error occurred during fitting for lag %d. Skipping this lag.",
                    lag,
                )
                logger.debug("%s", e)
                model = None
            else:
                if model is None:
                    # No exception but also no model (should be rare)
                    logger.warning(
                        "Model for lag %d is None. Assigning inf to AIC/BIC.", lag
                    )

            if self.save_models:
                # Failed fits are stored as None to keep models[lag - 1] aligned.
                self.models.append(model)

            if model is None:
                aic_values.append(np.inf)
                bic_values.append(np.inf)
                continue

            aic_values.append(self._criterion_or_inf(model, "aic", lag))
            bic_values.append(self._criterion_or_inf(model, "bic", lag))

        # argsort yields 0-based positions; +1 turns them into lag numbers.
        # A stable sort resolves ties (e.g. several inf) in favour of smaller lags.
        aic_ranked_lags = np.argsort(aic_values, kind="stable") + 1
        bic_ranked_lags = np.argsort(bic_values, kind="stable") + 1
        return aic_ranked_lags, bic_ranked_lags

    def rank_lags_by_pacf(self) -> np.ndarray:
        """
        Find the lags whose Partial Autocorrelation Function (PACF) value is significant.

        A lag counts as significant when the absolute PACF value exceeds
        ``1.96 / sqrt(n_obs)``.

        Returns
        -------
        np.ndarray
            Significant lags (1-based) in ascending order; empty if none.
        """
        from statsmodels.tsa.stattools import pacf

        n_obs = self.X.shape[0]
        # Partial correlations can only be computed for lags up to 50% of the
        # sample size. We cap at a third of the sample size (and at max_lag)
        # to leave room for other parameters and trends in the model, but
        # always request at least one lag.
        nlags = max(min(self.max_lag, n_obs // 3 - 1), 1)
        # Drop index 0, which is the lag-0 value.
        pacf_values = pacf(self.X, nlags=nlags)[1:]
        ci = 1.96 / np.sqrt(n_obs)
        return np.where(np.abs(pacf_values) > ci)[0] + 1

    def estimate_conservative_lag(self) -> int:
        """
        Estimate a conservative lag value by considering various metrics.

        The candidates are the lags present in both the AIC and the BIC
        ranking; for univariate ``X`` (1-D, or a single column) they are
        further restricted to the PACF-significant lags when there are any.
        The smallest remaining candidate is returned; if none remains, the
        best AIC lag is used.

        Returns
        -------
        int
            A conservative lag value.
        """
        aic_ranked_lags, bic_ranked_lags = self.rank_lags_by_aic_bic()
        if not aic_ranked_lags.size:
            logger.warning(
                "No lags identified by AIC/BIC (possibly due to model fitting issues). "
                "Cannot estimate a conservative lag. Defaulting to lag 1."
            )
            return 1

        # NOTE(review): both rankings are permutations of 1..max_lag, so this
        # intersection always contains every lag and only the PACF step below
        # can narrow it. Confirm whether a top-k consensus was intended.
        candidate_lags = set(aic_ranked_lags.tolist()) & set(bic_ranked_lags.tolist())

        # PACF is only used for univariate data. 1-D input has no second axis,
        # so check ndim before touching shape[1].
        is_univariate = self.X.ndim == 1 or self.X.shape[1] == 1
        if is_univariate:
            pacf_ranked_lags = self.rank_lags_by_pacf()
            if pacf_ranked_lags.size > 0:
                candidate_lags &= set(pacf_ranked_lags.tolist())
            # With no significant PACF lag we keep the AIC/BIC intersection.

        if not candidate_lags:
            best_aic_lag = int(aic_ranked_lags[0])
            logger.info(
                "No consensus lag found among AIC, BIC (and PACF if applicable). "
                "Using the best lag according to AIC (lag %d).",
                best_aic_lag,
            )
            return best_aic_lag

        # Conservative choice: the smallest lag every criterion agrees on.
        selected_lag = min(candidate_lags)
        logger.info("Estimated conservative lag: %d", selected_lag)
        return selected_lag

    def get_model(self, order: int):
        """
        Retrieve a previously fitted model given an order.

        Parameters
        ----------
        order : int
            Order (lag, 1-based) of the model to retrieve.

        Returns
        -------
        Union[AutoRegResultsWrapper, ARIMAResultsWrapper, SARIMAXResultsWrapper, VARResultsWrapper, ARCHModelResult]
            The model cached for ``order``. None if ``save_models`` is False,
            or if the fit for that lag failed.

        Raises
        ------
        IndexError
            If ``save_models`` is True but no model is cached for ``order``
            (order below 1, above the number of cached lags, or no ranking
            has been run yet).
        """
        if not self.save_models:
            return None
        # Reject order <= 0 explicitly; a bare models[order - 1] would wrap
        # around and silently return a model from the end of the list.
        if not 1 <= order <= len(self.models):
            raise IndexError(
                f"No cached model for order {order}; "
                f"{len(self.models)} model(s) are cached."
            )
        return self.models[order - 1]