Source code for vitalDSP.physiological_features.ensemble_based_feature_extraction

"""
Physiological Features Module for Physiological Signal Processing

This module provides comprehensive capabilities for physiological
signal processing including ECG, PPG, EEG, and other vital signs.

Author: vitalDSP Team
Date: 2025-01-27
Version: 1.0.0

Key Features:
- Object-oriented design with comprehensive classes
- Multiple processing methods and functions
- NumPy integration for numerical computations
- Feature extraction capabilities

Examples:
--------
Basic usage:
    >>> import numpy as np
    >>> from vitalDSP.physiological_features.ensemble_based_feature_extraction import EnsembleBasedFeatureExtraction
    >>> X = np.array([[1, 2], [3, 4], [5, 6]])
    >>> y = np.array([0, 1, 0])
    >>> extractor = EnsembleBasedFeatureExtraction(n_estimators=10)
    >>> features = extractor.random_forest_features(X, y)
    >>> print(f'Extracted features shape: {features.shape}')
"""

import numpy as np


class EnsembleBasedFeatureExtraction:
    """
    Feature extraction using simple, NumPy-only ensemble methods.

    The class implements small regression trees (variance-minimizing splits)
    and combines them in four ways: a forest of trees, Bagging, Boosting and
    Stacking. The outputs of the ensembles are returned as features.

    Methods
    -------
    random_forest_features : function
        Returns the per-tree predictions of a forest of trees.
    bagging_features : function
        Returns the averaged predictions of bootstrap-trained trees.
    boosting_features : function
        Returns the cumulative prediction of sequentially fitted trees.
    stacking_features : function
        Returns the meta-model output over the base trees' predictions.
    """

    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2):
        """
        Initialize the EnsembleBasedFeatureExtraction class.

        Parameters
        ----------
        n_estimators : int
            The number of estimators (trees) in the ensemble. Default is 100.
        max_depth : int or None
            The maximum depth of the trees. If None, the trees expand until
            no valid split exists or a node contains fewer than
            min_samples_split samples. Default is None.
        min_samples_split : int
            The minimum number of samples required to split an internal node.
            Default is 2.

        Notes
        -----
        More estimators increase the computational cost. Limiting max_depth
        or raising min_samples_split produces shallower trees.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def random_forest_features(self, X, y):
        """
        Extract features as the per-tree predictions of a forest of trees.

        Parameters
        ----------
        X : numpy.ndarray
            The input features with shape (n_samples, n_features).
        y : numpy.ndarray
            The target values with shape (n_samples,).

        Returns
        -------
        features : numpy.ndarray
            Array of shape (n_samples, n_estimators); column ``j`` holds the
            predictions of tree ``j`` on ``X``.

        Notes
        -----
        Every tree is trained on the full data set with a deterministic
        splitter, so all columns of the result are identical.

        Examples
        --------
        >>> X = np.array([[1, 2], [3, 4], [5, 6]])
        >>> y = np.array([0, 1, 0])
        >>> extractor = EnsembleBasedFeatureExtraction()
        >>> features = extractor.random_forest_features(X, y)
        >>> print(features)
        """
        X = np.asarray(X)
        y = np.asarray(y)
        forest = [self._build_tree(X, y) for _ in range(self.n_estimators)]
        return np.array([self._predict_tree(tree, X) for tree in forest]).T

    def bagging_features(self, X, y):
        """
        Extract features using a Bagging (Bootstrap Aggregating) ensemble.

        Each tree is trained on a bootstrap sample (rows drawn with
        replacement) of the data, and the trees' predictions on the original
        ``X`` are averaged.

        Parameters
        ----------
        X : numpy.ndarray
            The input features with shape (n_samples, n_features).
        y : numpy.ndarray
            The target values with shape (n_samples,).

        Returns
        -------
        aggregated_predictions : numpy.ndarray
            The mean prediction over all trees, with shape (n_samples,).

        Notes
        -----
        Sampling uses the global ``np.random`` state, so results are
        reproducible after ``np.random.seed``.

        Examples
        --------
        >>> X = np.array([[1, 2], [3, 4], [5, 6]])
        >>> y = np.array([0, 1, 0])
        >>> extractor = EnsembleBasedFeatureExtraction()
        >>> features = extractor.bagging_features(X, y)
        >>> print(features)
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(X)

        bagged_models = []
        for _ in range(self.n_estimators):
            # Draw the row indices once and apply them to BOTH X and y so
            # every resampled feature row keeps its own target value.
            sample_idx = np.random.choice(n_samples, n_samples, replace=True)
            bagged_models.append(self._build_tree(X[sample_idx], y[sample_idx]))

        predictions = np.array(
            [self._predict_tree(tree, X) for tree in bagged_models]
        ).T
        return np.mean(predictions, axis=1)

    def boosting_features(self, X, y, learning_rate=0.1):
        """
        Extract features using a Boosting ensemble.

        Trees are fitted sequentially; each tree is trained on the residual
        between ``y`` and the current cumulative prediction, and its output
        is added to that prediction scaled by ``learning_rate``.

        Parameters
        ----------
        X : numpy.ndarray
            The input features with shape (n_samples, n_features).
        y : numpy.ndarray
            The target values with shape (n_samples,).
        learning_rate : float, optional
            Scale applied to each tree's contribution. Default is 0.1.

        Returns
        -------
        predictions : numpy.ndarray
            The cumulative prediction after all boosting iterations, with
            shape (n_samples,).

        Examples
        --------
        >>> X = np.array([[1, 2], [3, 4], [5, 6]])
        >>> y = np.array([0, 1, 0])
        >>> extractor = EnsembleBasedFeatureExtraction()
        >>> features = extractor.boosting_features(X, y, learning_rate=0.1)
        >>> print(features)
        """
        X = np.asarray(X)
        y = np.asarray(y)
        predictions = np.zeros(X.shape[0])
        for _ in range(self.n_estimators):
            residual = y - predictions
            tree = self._build_tree(X, residual)
            predictions += learning_rate * self._predict_tree(tree, X)
        return predictions

    def stacking_features(self, X, y, meta_model=None):
        """
        Extract features using a Stacking ensemble.

        The base trees are trained on ``(X, y)`` and their predictions on
        ``X`` are passed to ``meta_model``.

        Parameters
        ----------
        X : numpy.ndarray
            The input features with shape (n_samples, n_features).
        y : numpy.ndarray
            The target values with shape (n_samples,).
        meta_model : callable or None, optional
            Called with the base predictions, an array of shape
            (n_samples, n_estimators); its return value is returned as is.
            If None, the row-wise mean is used. Default is None.

        Returns
        -------
        stacked_features : numpy.ndarray
            The output of the meta-model; shape (n_samples,) for the default
            meta-model.

        Notes
        -----
        Every base tree is trained on the full data set with a deterministic
        splitter, so all base predictions are identical.

        Examples
        --------
        >>> X = np.array([[1, 2], [3, 4], [5, 6]])
        >>> y = np.array([0, 1, 0])
        >>> extractor = EnsembleBasedFeatureExtraction()
        >>> features = extractor.stacking_features(X, y)
        >>> print(features)
        """
        X = np.asarray(X)
        y = np.asarray(y)
        base_models = [self._build_tree(X, y) for _ in range(self.n_estimators)]
        base_predictions = np.array(
            [self._predict_tree(tree, X) for tree in base_models]
        ).T
        if meta_model is None:
            return np.mean(base_predictions, axis=1)
        return meta_model(base_predictions)

    def _build_tree(self, X, y, depth=0):
        """
        Build a regression tree recursively.

        Parameters
        ----------
        X : numpy.ndarray
            The input features for this node, shape (n_samples, n_features).
        y : numpy.ndarray
            The target values for this node, shape (n_samples,).
        depth : int
            The depth of this node (0 for the root).

        Returns
        -------
        tree : tuple or float
            Either ``(feature, threshold, left_tree, right_tree)`` for a
            decision node, or the mean of ``y`` for a leaf.

        Notes
        -----
        A node becomes a leaf when it holds fewer than ``min_samples_split``
        samples, when ``max_depth`` is reached, or when no split separates
        the samples into two non-empty groups.
        """
        # A node holding exactly min_samples_split samples is still allowed
        # to split: the parameter is the minimum REQUIRED to split.
        too_small = len(y) < self.min_samples_split
        too_deep = self.max_depth is not None and depth >= self.max_depth
        if too_small or too_deep:
            return np.mean(y)

        feature, threshold = self._best_split(X, y)
        if feature is None or threshold is None:
            # No threshold yields two non-empty children.
            return np.mean(y)

        left_indices = X[:, feature] < threshold
        right_indices = ~left_indices
        left_tree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self._build_tree(X[right_indices], y[right_indices], depth + 1)
        return (feature, threshold, left_tree, right_tree)

    def _best_split(self, X, y):
        """
        Find the feature and threshold that best split the data.

        Every unique value of every feature is tried as a threshold; samples
        with ``value < threshold`` go left, the rest go right.

        Parameters
        ----------
        X : numpy.ndarray
            The input features, shape (n_samples, n_features).
        y : numpy.ndarray
            The target values, shape (n_samples,).

        Returns
        -------
        best_feature : int or None
            The index of the feature to split on, or None if no valid split
            exists.
        best_threshold : scalar or None
            The threshold to split on, or None if no valid split exists.

        Notes
        -----
        The score of a split is the sum over both children of
        (variance of the targets * number of samples); the lowest score wins
        and ties keep the first candidate found.
        """
        best_feature, best_threshold, best_score = None, None, float("inf")
        n_samples = X.shape[0]
        if n_samples <= 1:  # Nothing to split.
            return best_feature, best_threshold

        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                left_mask = column < threshold
                n_left = np.count_nonzero(left_mask)
                n_right = n_samples - n_left
                if n_left == 0 or n_right == 0:
                    continue  # Skip splits that leave one side empty.

                score = (
                    np.var(y[left_mask]) * n_left
                    + np.var(y[~left_mask]) * n_right
                )
                if score < best_score:
                    best_feature, best_threshold, best_score = (
                        feature,
                        threshold,
                        score,
                    )

        return best_feature, best_threshold

    def _predict_tree(self, tree, X):
        """
        Predict the output for the given inputs using a tree.

        Parameters
        ----------
        tree : tuple or float
            A decision node ``(feature, threshold, left_tree, right_tree)``
            or a leaf value.
        X : numpy.ndarray
            The input features to predict, shape (n_samples, n_features).

        Returns
        -------
        predictions : numpy.ndarray
            The predicted values, shape (n_samples,).

        Notes
        -----
        A leaf returns its value for every row. A decision node routes rows
        with ``X[:, feature] < threshold`` to the left subtree and the rest
        to the right subtree.
        """
        if not isinstance(tree, tuple):
            return np.full(X.shape[0], tree)

        feature, threshold, left_tree, right_tree = tree
        left_indices = X[:, feature] < threshold
        right_indices = ~left_indices

        predictions = np.empty(X.shape[0])
        predictions[left_indices] = self._predict_tree(left_tree, X[left_indices])
        predictions[right_indices] = self._predict_tree(right_tree, X[right_indices])
        return predictions