Source code for vitalDSP.physiological_features.ensemble_based_feature_extraction
"""
Physiological Features Module for Physiological Signal Processing
This module provides comprehensive capabilities for physiological
signal processing including ECG, PPG, EEG, and other vital signs.
Author: vitalDSP Team
Date: 2025-01-27
Version: 1.0.0
Key Features:
- Object-oriented design with comprehensive classes
- Multiple processing methods and functions
- NumPy integration for numerical computations
- Feature extraction capabilities
Examples:
--------
Basic usage:
>>> import numpy as np
>>> from vitalDSP.physiological_features.ensemble_based_feature_extraction import EnsembleBasedFeatureExtraction
>>> X = np.random.randn(100, 4)
>>> y = np.random.randn(100)
>>> extractor = EnsembleBasedFeatureExtraction(n_estimators=10)
>>> features = extractor.random_forest_features(X, y)
>>> print(f'Feature matrix shape: {features.shape}')
"""
import numpy as np
[docs]
class EnsembleBasedFeatureExtraction:
    """
    Feature extraction with hand-rolled tree ensembles.

    All ensembles are built from the same primitive: a regression tree that
    is grown by greedily choosing the split minimizing the weighted variance
    of the targets in the two children (see ``_build_tree`` and
    ``_best_split``).

    Methods
    -------
    random_forest_features(X, y)
        Per-tree predictions, shape ``(n_samples, n_estimators)``.
    bagging_features(X, y)
        Mean prediction of trees fitted on bootstrap resamples.
    boosting_features(X, y, learning_rate=0.1)
        Cumulative prediction of sequentially fitted residual trees.
    stacking_features(X, y, meta_model=None)
        Output of a meta-model applied to the base trees' predictions.

    Notes
    -----
    ``_build_tree`` is deterministic: it draws no random numbers and considers
    every feature at every node. ``random_forest_features`` and
    ``stacking_features`` call it ``n_estimators`` times with identical
    arguments, so every tree they build is identical. Only
    ``bagging_features`` introduces randomness (bootstrap resampling via
    ``np.random.choice``).
    """

    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2):
        """
        Store the hyper-parameters shared by all ensemble methods.

        Parameters
        ----------
        n_estimators : int
            Number of trees built by each ensemble method. Default is 100.
        max_depth : int or None
            Maximum depth of each tree. ``None`` means no depth limit: a node
            is split until it is pure or holds fewer than
            ``min_samples_split`` samples. Default is None.
        min_samples_split : int
            Minimum number of samples a node must hold to be considered for
            splitting. Default is 2.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def random_forest_features(self, X, y):
        """
        Return the prediction of every tree as one feature column.

        Parameters
        ----------
        X : array-like
            Input features with shape (n_samples, n_features).
        y : array-like
            Targets with shape (n_samples,).

        Returns
        -------
        features : numpy.ndarray
            Array of shape (n_samples, n_estimators); column ``j`` holds the
            predictions of tree ``j`` on ``X``.

        Notes
        -----
        Every tree is fitted on the full, un-resampled data with a
        deterministic builder, so all columns are identical.

        Examples
        --------
        >>> X = np.array([[1, 2], [3, 4], [5, 6]])
        >>> y = np.array([0, 1, 0])
        >>> extractor = EnsembleBasedFeatureExtraction(n_estimators=4)
        >>> extractor.random_forest_features(X, y).shape
        (3, 4)
        """
        X = np.asarray(X)
        y = np.asarray(y)
        forest = [self._build_tree(X, y) for _ in range(self.n_estimators)]
        # Stack per-tree prediction vectors as rows, then transpose so that
        # rows index samples and columns index trees.
        return np.array([self._predict_tree(tree, X) for tree in forest]).T

    def bagging_features(self, X, y):
        """
        Average the predictions of trees fitted on bootstrap resamples.

        Each tree is trained on ``n_samples`` rows drawn with replacement
        from ``(X, y)``; the same row indices are applied to both arrays so
        that every sampled feature row keeps its own target.

        Parameters
        ----------
        X : array-like
            Input features with shape (n_samples, n_features).
        y : array-like
            Targets with shape (n_samples,).

        Returns
        -------
        aggregated_predictions : numpy.ndarray
            Mean over all trees of their predictions on the original ``X``,
            shape (n_samples,).

        Examples
        --------
        >>> X = np.array([[1, 2], [3, 4], [5, 6]])
        >>> y = np.array([0, 1, 0])
        >>> extractor = EnsembleBasedFeatureExtraction(n_estimators=4)
        >>> extractor.bagging_features(X, y).shape
        (3,)
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(X)

        bagged_models = []
        for _ in range(self.n_estimators):
            # One index draw per tree, used for BOTH X and y: resampling only
            # X would pair each sampled row with an unrelated target.
            sample_idx = np.random.choice(n_samples, n_samples, replace=True)
            bagged_models.append(self._build_tree(X[sample_idx], y[sample_idx]))

        predictions = np.array(
            [self._predict_tree(tree, X) for tree in bagged_models]
        ).T
        return np.mean(predictions, axis=1)

    def boosting_features(self, X, y, learning_rate=0.1):
        """
        Fit trees sequentially on residuals and return the running sum.

        Starting from an all-zero prediction, each iteration fits a tree to
        ``y - predictions`` and adds ``learning_rate`` times that tree's
        output to the prediction.

        Parameters
        ----------
        X : array-like
            Input features with shape (n_samples, n_features).
        y : array-like
            Targets with shape (n_samples,).
        learning_rate : float, optional
            Multiplier applied to each tree's contribution. Default is 0.1.

        Returns
        -------
        predictions : numpy.ndarray
            Cumulative prediction after ``n_estimators`` iterations, shape
            (n_samples,).

        Examples
        --------
        >>> X = np.array([[1, 2], [3, 4], [5, 6]])
        >>> y = np.array([0, 1, 0])
        >>> extractor = EnsembleBasedFeatureExtraction(n_estimators=4)
        >>> extractor.boosting_features(X, y, learning_rate=0.1).shape
        (3,)
        """
        X = np.asarray(X)
        y = np.asarray(y)
        predictions = np.zeros(X.shape[0])
        for _ in range(self.n_estimators):
            residual = y - predictions
            tree = self._build_tree(X, residual)
            predictions += learning_rate * self._predict_tree(tree, X)
        return predictions

    def stacking_features(self, X, y, meta_model=None):
        """
        Combine the base trees' predictions with a meta-model.

        Parameters
        ----------
        X : array-like
            Input features with shape (n_samples, n_features).
        y : array-like
            Targets with shape (n_samples,).
        meta_model : callable or None, optional
            Called with the base prediction matrix of shape
            (n_samples, n_estimators); its return value is returned
            unchanged. If None, the row-wise mean is used. Default is None.

        Returns
        -------
        stacked_features : numpy.ndarray
            Output of the meta-model; shape (n_samples,) for the default
            row-wise mean.

        Notes
        -----
        The base trees are all fitted on the same data with a deterministic
        builder, so the columns passed to the meta-model are identical.

        Examples
        --------
        >>> X = np.array([[1, 2], [3, 4], [5, 6]])
        >>> y = np.array([0, 1, 0])
        >>> extractor = EnsembleBasedFeatureExtraction(n_estimators=4)
        >>> extractor.stacking_features(X, y).shape
        (3,)
        """
        X = np.asarray(X)
        y = np.asarray(y)
        base_models = [self._build_tree(X, y) for _ in range(self.n_estimators)]
        base_predictions = np.array(
            [self._predict_tree(tree, X) for tree in base_models]
        ).T
        if meta_model is None:
            return np.mean(base_predictions, axis=1)
        return meta_model(base_predictions)

    def _build_tree(self, X, y, depth=0):
        """
        Recursively grow a regression tree.

        Parameters
        ----------
        X : numpy.ndarray
            Feature rows reaching this node, shape (n_node_samples, n_features).
        y : numpy.ndarray
            Targets of those rows, shape (n_node_samples,).
        depth : int
            Depth of this node; the root is at depth 0.

        Returns
        -------
        tree : tuple or float
            A leaf is the mean of ``y``. An internal node is the tuple
            ``(feature, threshold, left_tree, right_tree)`` where rows with
            ``X[:, feature] < threshold`` go left and all others go right.

        Notes
        -----
        A node becomes a leaf when any of the following holds:

        * it holds fewer than ``min_samples_split`` samples (and never fewer
          than 2, since a single sample cannot be divided);
        * ``max_depth`` is set and has been reached;
        * all of its targets are equal (no split can reduce the variance);
        * ``_best_split`` finds no split with two non-empty children.
        """
        n_samples = len(y)
        too_small = n_samples < max(self.min_samples_split, 2)
        too_deep = self.max_depth is not None and depth >= self.max_depth
        # The purity test is evaluated last, so y[0] is only read when the
        # node holds at least two samples. Stopping on pure nodes also keeps
        # a constant target from recursing one sample per level, which could
        # exceed the interpreter's recursion limit on large inputs.
        if too_small or too_deep or np.all(y == y[0]):
            return np.mean(y)

        feature, threshold = self._best_split(X, y)
        if feature is None or threshold is None:  # no valid split exists
            return np.mean(y)

        goes_left = X[:, feature] < threshold
        goes_right = ~goes_left
        left_tree = self._build_tree(X[goes_left], y[goes_left], depth + 1)
        right_tree = self._build_tree(X[goes_right], y[goes_right], depth + 1)
        return (feature, threshold, left_tree, right_tree)

    def _best_split(self, X, y):
        """
        Find the (feature, threshold) pair minimizing the weighted variance.

        Every unique value of every feature column is a candidate threshold;
        rows with a value strictly below the threshold form the left group.
        The score of a candidate is
        ``var(y_left) * n_left + var(y_right) * n_right``. On ties the first
        candidate encountered (lowest feature index, then lowest threshold)
        is kept.

        Parameters
        ----------
        X : numpy.ndarray
            Feature rows, shape (n_node_samples, n_features).
        y : numpy.ndarray
            Targets, shape (n_node_samples,).

        Returns
        -------
        best_feature : int or None
            Index of the chosen feature, or None if no candidate produced
            two non-empty groups.
        best_threshold : scalar or None
            Chosen threshold, or None under the same condition.
        """
        best_feature, best_threshold, best_score = None, None, float("inf")
        n_samples = X.shape[0]
        if n_samples <= 1:  # nothing to divide
            return best_feature, best_threshold

        for feature in range(X.shape[1]):
            column = X[:, feature]
            # np.unique returns sorted values; the smallest one can never
            # have anything strictly below it, so it is skipped up front.
            for threshold in np.unique(column)[1:]:
                goes_left = column < threshold
                n_left = np.count_nonzero(goes_left)
                n_right = n_samples - n_left
                # Still needed for thresholds such as NaN, for which the
                # comparison is False for every row.
                if n_left == 0 or n_right == 0:
                    continue

                y_left = y[goes_left]
                y_right = y[~goes_left]
                left_score = np.mean((y_left - np.mean(y_left)) ** 2)
                right_score = np.mean((y_right - np.mean(y_right)) ** 2)
                score = left_score * n_left + right_score * n_right

                if score < best_score:
                    best_feature, best_threshold, best_score = (
                        feature,
                        threshold,
                        score,
                    )
        return best_feature, best_threshold

    def _predict_tree(self, tree, X):
        """
        Evaluate a tree produced by ``_build_tree`` on the rows of ``X``.

        Parameters
        ----------
        tree : tuple or float
            Either a leaf value or a ``(feature, threshold, left_tree,
            right_tree)`` node.
        X : numpy.ndarray
            Feature rows to predict, shape (n_rows, n_features).

        Returns
        -------
        predictions : numpy.ndarray
            One predicted value per row of ``X``, shape (n_rows,).
        """
        if not isinstance(tree, tuple):
            # Leaf: every row reaching it receives the same value.
            return np.full(X.shape[0], tree)

        feature, threshold, left_tree, right_tree = tree
        goes_left = X[:, feature] < threshold
        goes_right = ~goes_left
        predictions = np.empty(X.shape[0])
        predictions[goes_left] = self._predict_tree(left_tree, X[goes_left])
        predictions[goes_right] = self._predict_tree(right_tree, X[goes_right])
        return predictions