Source code for vitalDSP.transforms.mfcc

"""
Signal Transforms Module for Physiological Signal Processing

This module provides comprehensive capabilities for physiological
signal processing including ECG, PPG, EEG, and other vital signs.

Author: vitalDSP Team
Date: 2025-01-27
Version: 1.0.0

Key Features:
- Object-oriented design with comprehensive classes
- Multiple processing methods and functions
- NumPy integration for numerical computations

Examples:
---------
Basic usage:
    >>> import numpy as np
    >>> from vitalDSP.transforms.mfcc import MFCC
    >>> signal = np.random.randn(1000)
    >>> processor = MFCC(signal)
    >>> result = processor.compute_mfcc()
    >>> print(f'Processing result: {result}')
"""

import numpy as np


class MFCC:
    """
    A class to compute Mel-Frequency Cepstral Coefficients (MFCC) of a signal.

    MFCCs are widely used in audio processing tasks, particularly in speech
    recognition. This class pre-processes a 1D signal (pre-emphasis, framing,
    Hamming windowing), computes the power spectrum of every frame, applies a
    triangular Mel filter bank and finally decorrelates the log energies with
    a DCT.

    Methods
    -------
    dct : method
        Computes the Discrete Cosine Transform (DCT) of the filter bank energies.
    compute_mfcc : method
        Computes the MFCCs of the input signal.
    """

    # Fixed analysis settings used by ``compute_mfcc``.
    _PRE_EMPHASIS = 0.97  # pre-emphasis filter coefficient
    _FRAME_SIZE = 0.025  # frame duration in seconds (25 ms)
    _FRAME_STRIDE = 0.01  # hop between frame starts in seconds (10 ms)
    _MIN_NFFT = 512  # smallest FFT size ever used

    def __init__(self, signal, sample_rate=16000, num_filters=40, num_coefficients=13):
        """
        Initialize the MFCC class with the signal and relevant parameters.

        Parameters
        ----------
        signal : numpy.ndarray
            The input signal (1D; array-likes such as lists are also accepted).
        sample_rate : int, optional
            The sample rate of the signal in Hertz (default is 16000 Hz).
        num_filters : int, optional
            The number of Mel filters to apply (default is 40).
        num_coefficients : int, optional
            The number of MFCC coefficients to extract (default is 13).

        Notes
        -----
        The arguments are stored unchanged; no validation or conversion is
        performed here.
        """
        self.signal = signal
        self.sample_rate = sample_rate
        self.num_filters = num_filters
        self.num_coefficients = num_coefficients

    def dct(self, signal):
        """
        Compute the Discrete Cosine Transform (DCT) of the input signal.

        An orthonormally scaled DCT-II is evaluated along axis 1 for the
        orders ``1 .. num_coefficients``; the order-0 (DC) term is not
        returned.

        Parameters
        ----------
        signal : numpy.ndarray
            2D array (or nested sequence) of shape ``(rows, n)``, typically
            the log filter bank energies with one row per frame.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(rows, num_coefficients)``.

        Examples
        --------
        >>> signal = np.array([[1, 2, 3], [4, 5, 6]])
        >>> mfcc = MFCC(signal)
        >>> dct_result = mfcc.dct(signal)
        >>> print(dct_result)
        """
        signal = np.asarray(signal, dtype=float)
        n = signal.shape[1]
        orders = np.arange(1, self.num_coefficients + 1)
        # basis[i, j] = cos(pi * (i + 0.5) * orders[j] / n); one matrix product
        # then evaluates every requested order for every row at once.
        basis = np.cos(np.outer(np.pi * (np.arange(n) + 0.5), orders) / n)
        return (signal @ basis) * np.sqrt(2 / n)

    def _windowed_frames(self):
        """Pre-emphasize the signal and cut it into Hamming-windowed frames.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(num_frames, frame_length)``.
        """
        signal = np.asarray(self.signal)

        # Step 1: Pre-emphasis (first sample is passed through unchanged).
        emphasized = np.append(
            signal[0], signal[1:] - self._PRE_EMPHASIS * signal[:-1]
        )

        # Step 2: Framing. Both sizes are clamped to one sample so that very
        # low sample rates (where 10 ms rounds to 0 samples) do not end in a
        # division by zero or in empty frames.
        frame_length = max(1, int(round(self._FRAME_SIZE * self.sample_rate)))
        frame_step = max(1, int(round(self._FRAME_STRIDE * self.sample_rate)))
        signal_length = len(emphasized)
        # abs() keeps the count positive when the signal is shorter than a frame.
        num_frames = int(np.ceil(abs(signal_length - frame_length) / frame_step)) + 1

        # Zero-pad so that every frame has the same number of samples.
        pad_length = num_frames * frame_step + frame_length
        padded = np.append(emphasized, np.zeros(pad_length - signal_length))

        # Row r of ``indices`` selects samples r*frame_step .. +frame_length-1.
        starts = np.arange(0, num_frames * frame_step, frame_step)
        indices = starts[:, None] + np.arange(frame_length)[None, :]
        frames = padded[indices]

        # Step 3: Windowing (fancy indexing returned a copy, so in-place is safe).
        frames *= np.hamming(frame_length)
        return frames

    def _mel_filter_bank(self, nfft):
        """Build the triangular Mel filter bank for an FFT of size ``nfft``.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(num_filters, nfft // 2 + 1)``.
        """
        low_freq_mel = 0
        high_freq_mel = 2595 * np.log10(1 + (self.sample_rate / 2) / 700)
        # num_filters triangles need num_filters + 2 edge points.
        mel_points = np.linspace(low_freq_mel, high_freq_mel, self.num_filters + 2)
        hz_points = 700 * (10 ** (mel_points / 2595) - 1)
        bin_edges = np.floor((nfft + 1) * hz_points / self.sample_rate)

        fbank = np.zeros((self.num_filters, nfft // 2 + 1))
        bins = np.arange(fbank.shape[1])
        for m in range(1, self.num_filters + 1):
            left, centre, right = bin_edges[m - 1], bin_edges[m], bin_edges[m + 1]
            lo, mid, hi = int(left), int(centre), int(right)
            # The guards skip degenerate (zero-width) slopes, which would
            # otherwise divide by zero.
            if mid > lo:
                fbank[m - 1, lo:mid] = (bins[lo:mid] - left) / (centre - left)
            if hi > mid:
                fbank[m - 1, mid:hi] = (right - bins[mid:hi]) / (right - centre)
        return fbank

    def compute_mfcc(self):
        """
        Compute the Mel-Frequency Cepstral Coefficients (MFCC) of the input signal.

        Returns
        -------
        numpy.ndarray
            A 2D array of shape ``(num_frames, num_coefficients)`` where each
            row contains the MFCCs for a frame of the signal.

        Raises
        ------
        IndexError
            If the stored signal is empty.

        Steps
        -----
        1. Pre-emphasis: Emphasizes higher frequencies in the signal.
        2. Framing: Divides the signal into overlapping frames (25 ms long,
           10 ms apart, at least one sample each).
        3. Windowing: Applies a Hamming window to each frame.
        4. FFT and Power Spectrum: The FFT size is 512, or the next power of
           two that holds a whole frame when frames are longer than 512
           samples, so no samples are discarded.
        5. Mel Filter Banks: Applies triangular filters to the power spectrum.
        6. DCT: Computes the DCT of the log filter bank energies.

        Examples
        --------
        >>> signal = np.sin(np.linspace(0, 10, 1000))
        >>> mfcc = MFCC(signal)
        >>> mfcc_result = mfcc.compute_mfcc()
        >>> print(mfcc_result)
        """
        # Steps 1-3: pre-emphasis, framing, windowing.
        frames = self._windowed_frames()
        frame_length = frames.shape[1]

        # Step 4: FFT and power spectrum. rfft crops its input when n is
        # smaller than the frame, so grow n to the next power of two if needed.
        nfft = max(self._MIN_NFFT, 1 << (frame_length - 1).bit_length())
        mag_frames = np.absolute(np.fft.rfft(frames, nfft))
        pow_frames = (1.0 / nfft) * (mag_frames**2)

        # Step 5: Mel filter banks.
        fbank = self._mel_filter_bank(nfft)
        filter_banks = np.dot(pow_frames, fbank.T)
        # Replace exact zeros so the logarithm stays finite.
        filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
        filter_banks = 20 * np.log10(filter_banks)  # Convert to dB

        # Step 6: Mel-frequency cepstral coefficients.
        return self.dct(filter_banks)