vitalDSP Data Loader Tutorial

This notebook demonstrates the comprehensive data loading capabilities of vitalDSP.

Contents

  1. Basic Loading

  2. Format-Specific Examples

  3. Multi-Channel Data

  4. Data Validation

  5. Metadata Extraction

  6. Data Export

  7. Advanced Features

  8. Real-World Examples

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from vitalDSP.utils.data_processing.data_loader import (
    DataLoader,
    DataFormat,
    SignalType,
    load_signal,
    load_multi_channel
)

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

1. Basic Loading

Creating Sample Data

# Create sample ECG data
fs = 250  # Sampling rate (Hz)
duration = 10  # seconds

# Build the time axis from the sample index so that consecutive samples are
# exactly 1/fs apart. np.linspace(0, duration, fs * duration) includes the
# endpoint, which spaces the samples duration/(N-1) apart (~249.9 Hz) and
# disagrees with the declared sampling rate.
t = np.arange(fs * duration) / fs

# Simulate ECG signal: a 1.2 Hz sine, its 2.4 Hz harmonic, and Gaussian noise
ecg = np.sin(2 * np.pi * 1.2 * t) + 0.3 * np.sin(2 * np.pi * 2.4 * t)
ecg += 0.1 * np.random.randn(len(t))

# Create DataFrame with one time column and one signal column
df = pd.DataFrame({
    'time': t,
    'ecg': ecg
})

# Save to CSV (index=False keeps the file to exactly the two columns above)
df.to_csv('sample_ecg.csv', index=False)

print(f"Created sample ECG data: {len(df)} samples at {fs} Hz")

Loading CSV Data

# Load the CSV file, declaring the sampling rate and signal type up front
loader = DataLoader('sample_ecg.csv', sampling_rate=250.0, signal_type='ecg')
# time_column names the column that holds the time axis
data = loader.load(time_column='time')

# `data` is used like a pandas DataFrame below (len, .columns, .head())
print(f"Loaded {len(data)} samples")
print(f"Columns: {list(data.columns)}")
print(f"\nFirst few rows:")
print(data.head())

Quick Loading with Convenience Function

# Quick load: one call instead of constructing a DataLoader and calling load()
data_quick = load_signal('sample_ecg.csv', sampling_rate=250)

# The result exposes .shape (presumably a DataFrame or array -- confirm in the API docs)
print(f"Quick loaded: {data_quick.shape}")

Visualizing Loaded Data

# Draw the loaded ECG trace against its time column on one wide axes,
# using the object-oriented Axes API.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(data['time'], data['ecg'])
ax.set_xlabel('Time (s)')
ax.set_ylabel('ECG (mV)')
ax.set_title('Loaded ECG Signal')
ax.grid(True)
plt.show()

2. Format-Specific Examples

JSON Format

import json

# Create JSON data with metadata.
# Only the first rows of the loaded recording are written, so 'duration' is
# derived from the number of samples actually stored rather than repeating the
# length of the full recording. The comprehension variables are named so they
# cannot be confused with the global time axis `t`.
json_rate = 250
json_samples = [
    {'time': float(t_val), 'ecg': float(ecg_val)}
    for t_val, ecg_val in zip(data['time'][:100], data['ecg'][:100])
]

json_data = {
    'sampling_rate': json_rate,
    'signal_type': 'ecg',
    'duration': len(json_samples) / json_rate,
    'data': json_samples
}

with open('sample_ecg.json', 'w') as f:
    json.dump(json_data, f)

# Load JSON (format is left for the loader to determine from the file)
loader_json = DataLoader('sample_ecg.json')
data_json = loader_json.load()

print(f"Loaded from JSON: {len(data_json)} samples")
print(f"Metadata: {loader_json.metadata}")

NumPy Format

# Save as .npy: only the raw ECG values are written (no time column)
np.save('sample_ecg.npy', data['ecg'].values)

# Load .npy; no sampling rate or signal type is passed to the loader here
loader_npy = DataLoader('sample_ecg.npy')
data_npy = loader_npy.load()

print(f"Loaded from .npy: {data_npy.shape}")
print(f"Metadata: {loader_npy.metadata}")

Multi-signal NPZ Format

# Create multi-signal data on the same time axis `t` as the ECG:
# a 1.0 Hz sine (PPG stand-in) and a 0.3 Hz sine (respiration stand-in),
# each with additive Gaussian noise
ppg = 0.5 * np.sin(2 * np.pi * 1.0 * t) + 0.1 * np.random.randn(len(t))
resp = 0.3 * np.sin(2 * np.pi * 0.3 * t) + 0.05 * np.random.randn(len(t))

# Save as .npz: each keyword becomes a named array inside the archive
np.savez('multi_signals.npz', ecg=ecg, ppg=ppg, resp=resp, time=t)

# Load .npz
loader_npz = DataLoader('multi_signals.npz')
data_npz = loader_npz.load()

# The loaded object is accessed like a mapping (.keys(), item lookup by name)
print(f"Loaded signals: {list(data_npz.keys())}")
print(f"ECG shape: {data_npz['ecg'].shape}")
print(f"PPG shape: {data_npz['ppg'].shape}")

3. Multi-Channel Data

Creating Multi-Channel Dataset

# Assemble the multi-channel table: start from the time axis and attach the
# three signals as additional columns (column order: time, ECG, PPG, RESP).
df_multi = pd.DataFrame({'time': t}).assign(ECG=ecg, PPG=ppg, RESP=resp)

# Persist without the index so the file holds only the four columns
df_multi.to_csv('multi_channel.csv', index=False)

print(f"Created multi-channel data: {df_multi.shape}")
print(f"Channels: {list(df_multi.columns)}")

Loading Specific Channels

# Load only ECG and PPG (plus the time column); RESP is left out of `columns`
loader_multi = DataLoader('multi_channel.csv')
data_selected = loader_multi.load(columns=['time', 'ECG', 'PPG'])

print(f"Selected channels: {list(data_selected.columns)}")

Using load_multi_channel

# Load as dictionary of channels, requesting the three signal columns by name
channels = load_multi_channel('multi_channel.csv', channels=['ECG', 'PPG', 'RESP'])

# Each value supports len(), .mean() and .std(); print a one-line summary per channel
for name, signal in channels.items():
    print(f"{name}: {len(signal)} samples, mean={signal.mean():.3f}, std={signal.std():.3f}")

Visualizing Multi-Channel Data

# Three stacked panels sharing the x axis, one per channel.
fig, axes = plt.subplots(3, 1, figsize=(14, 10), sharex=True)

# (channel key, y-axis label, panel title) for each row, top to bottom
panels = (
    ('ECG', 'ECG (mV)', 'ECG Signal'),
    ('PPG', 'PPG (AU)', 'PPG Signal'),
    ('RESP', 'RESP (AU)', 'Respiratory Signal'),
)

for ax, (channel, y_label, panel_title) in zip(axes, panels):
    ax.plot(t, channels[channel])
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.grid(True)

# Only the bottom panel carries the x-axis label
axes[2].set_xlabel('Time (s)')

plt.tight_layout()
plt.show()

4. Data Validation

Creating Data with Issues

# Create data with NaN and Inf, working on a copy so `ecg` itself stays clean
signal_with_issues = ecg.copy()
signal_with_issues[100:110] = np.nan  # Add NaN (10 consecutive samples)
signal_with_issues[500] = np.inf  # Add Inf (a single sample)

df_issues = pd.DataFrame({
    'time': t,
    'signal': signal_with_issues
})

df_issues.to_csv('signal_with_issues.csv', index=False)

# Report how many bad values were injected (counted from the DataFrame column)
print(f"Created signal with {df_issues['signal'].isna().sum()} NaN values")
print(f"Created signal with {np.isinf(df_issues['signal']).sum()} Inf values")

Loading with Validation

# NOTE(review): `warnings` is imported but not referenced in this cell --
# presumably the loader reports validation problems through it; confirm.
import warnings

# Enable validation (will show warnings)
print("Loading with validation enabled:")
loader_validate = DataLoader('signal_with_issues.csv', validate=True)
data_validated = loader_validate.load()

# The load still returns data; only its shape is reported here
print(f"\nData loaded: {data_validated.shape}")

5. Metadata Extraction

Extracting Comprehensive Info

# Load with metadata extraction
# (rebinds the notebook-level `loader` and `data` names to the multi-channel file)
loader = DataLoader('multi_channel.csv', sampling_rate=250.0, signal_type='ecg')
data = loader.load(time_column='time')

# Get full info: a dict read below via the keys
# 'file_path', 'format', 'signal_type', 'sampling_rate' and 'metadata'
info = loader.get_info()

print("=== Data Information ===")
print(f"File: {info['file_path']}")
print(f"Format: {info['format']}")
print(f"Signal Type: {info['signal_type']}")
print(f"Sampling Rate: {info['sampling_rate']} Hz")
print(f"\nMetadata:")
# info['metadata'] is itself a mapping; list every entry it contains
for key, value in info['metadata'].items():
    print(f"  {key}: {value}")

Automatic Sampling Rate Detection

# Load without specifying sampling rate; only the time column is named
loader_auto = DataLoader('multi_channel.csv')
data_auto = loader_auto.load(time_column='time')

# 'computed_sampling_rate' is looked up with a default, so 'N/A' is printed
# when the loader did not record one. Presumably the value is derived from the
# spacing of the 'time' column -- confirm against the loader documentation.
print(f"Computed sampling rate: {loader_auto.metadata.get('computed_sampling_rate', 'N/A')} Hz")
print(f"Expected: 250 Hz")

6. Data Export

Exporting to Multiple Formats

# Reload the multi-channel recording that will be written back out
loader = DataLoader('multi_channel.csv')
data = loader.load()

# Write the same data to each target file in turn; the output format is left
# to export() to choose from the file name.
print("Exporting to multiple formats...")

export_targets = (
    ('exported_data.csv', "✓ Exported to CSV"),
    ('exported_data.json', "✓ Exported to JSON"),
    ('exported_data.pkl', "✓ Exported to Pickle"),
)

for target_path, done_message in export_targets:
    loader.export(data, target_path)
    print(done_message)

print("\nExport complete!")

Verifying Exported Data

# Load back the exported JSON
loader_verify = DataLoader('exported_data.json')
data_verify = loader_verify.load()

print(f"Original shape: {data.shape}")
print(f"Exported and reloaded shape: {data_verify.shape}")
# NOTE: this check compares shapes only; it does not compare the values
print(f"\nData integrity: {'✓ PASSED' if data.shape == data_verify.shape else '✗ FAILED'}")

7. Advanced Features

Loading from NumPy Array

# Create array: 1000 samples of Gaussian noise standing in for a signal
signal_array = np.random.randn(1000)

# Load from array: the loader is constructed without a file path and the
# sampling rate / signal type are supplied with the array instead
loader = DataLoader()
df_from_array = loader.load_from_array(
    signal_array,
    sampling_rate=250.0,
    signal_type='ecg'
)

print(f"Loaded from array: {df_from_array.shape}")
print(f"Sampling rate: {loader.sampling_rate} Hz")
# signal_type is read through .value here (presumably a SignalType enum member -- confirm)
print(f"Signal type: {loader.signal_type.value}")

Loading from DataFrame

# Create DataFrame with two 500-sample noise columns and no time column
df_custom = pd.DataFrame({
    'ecg': np.random.randn(500),
    'ppg': np.random.randn(500)
})

# Load from DataFrame, supplying the sampling rate explicitly
loader = DataLoader()
df_loaded = loader.load_from_dataframe(df_custom, sampling_rate=100.0)

print(f"Loaded from DataFrame: {df_loaded.shape}")
print(f"Metadata: {loader.metadata}")

Format Detection

import tempfile

# Test format detection
test_files = [
    'data.csv',
    'data.json',
    'data.xlsx',
    'data.npy',
    'data.mat'
]

print("=== Format Detection ===")
# Create the empty probe files inside a throwaway directory. Touching and then
# unlinking 'data.csv' etc. in the working directory would delete a real file
# of the same name, and would leave the probe behind if DataLoader raised.
# The context manager removes the directory and its contents in every case.
with tempfile.TemporaryDirectory() as scratch_dir:
    for filename in test_files:
        # Create empty file
        probe_path = Path(scratch_dir) / filename
        probe_path.touch()

        # Pass a plain string path, as the rest of the notebook does
        loader = DataLoader(str(probe_path))
        print(f"{filename:15} -> {loader.format.value}")

List Supported Formats

# Get supported formats
formats = DataLoader.list_supported_formats()

print("=== Supported Formats ===")
# Drop the 'unknown' placeholder before numbering; filtering inside the
# enumerate loop would leave a gap in the printed numbers whenever 'unknown'
# is not the last entry.
listed_formats = [fmt for fmt in formats if fmt != 'unknown']

for i, fmt in enumerate(listed_formats, 1):
    req = DataLoader.get_format_requirements(fmt)
    if req:
        # Show the description when the requirements entry provides one
        print(f"{i}. {fmt.upper():10} - {req.get('description', 'N/A')}")
    else:
        print(f"{i}. {fmt.upper()}")

8. Real-World Examples

Example 1: ECG Analysis Pipeline

# End-to-end ECG workflow: load, pull out arrays, summarize, plot.

# Step 1 - read the recording, naming its time column
loader = DataLoader('sample_ecg.csv', signal_type='ecg')
df = loader.load(time_column='time')

# Step 2 - work on plain arrays from here on
time = df['time'].values
ecg_signal = df['ecg'].values

# Step 3 - descriptive statistics (duration is the last timestamp)
print("=== ECG Analysis ===")
print(f"Duration: {time[-1]:.1f} seconds")
print(f"Samples: {len(ecg_signal)}")
print(f"Mean: {ecg_signal.mean():.3f} mV")
print(f"Std Dev: {ecg_signal.std():.3f} mV")
print(f"Min: {ecg_signal.min():.3f} mV")
print(f"Max: {ecg_signal.max():.3f} mV")

# Step 4 - single-panel plot with a light grid
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(time, ecg_signal)
ax.set_xlabel('Time (s)')
ax.set_ylabel('ECG (mV)')
ax.set_title('ECG Signal Analysis')
ax.grid(True, alpha=0.3)
plt.show()

Example 2: Batch Processing

# Simulate multiple patient recordings
patient_fs = 250  # Hz
patient_duration = 5  # seconds

for i in range(3):
    # Create different signals.
    # The time axis is built from the sample index so the spacing is exactly
    # 1/patient_fs; np.linspace(0, 5, 1250) includes the endpoint and yields
    # a spacing of 5/1249 s, i.e. not a 250 Hz recording.
    t_patient = np.arange(patient_fs * patient_duration) / patient_fs
    # Each patient gets a slightly higher base frequency (1.0, 1.1, 1.2 Hz)
    signal = np.sin(2 * np.pi * (1.0 + i * 0.1) * t_patient)
    signal += 0.1 * np.random.randn(len(t_patient))
    
    df_patient = pd.DataFrame({'time': t_patient, 'ecg': signal})
    df_patient.to_csv(f'patient_{i+1}.csv', index=False)

# Walk over the three patient files and collect one summary row per file
results = []

for i in range(3):
    filename = f'patient_{i+1}.csv'
    
    # Read this patient's recording
    loader = DataLoader(filename, sampling_rate=250.0)
    data = loader.load()
    
    # Reduce the ECG column to a few descriptive numbers
    signal = data['ecg'].values
    summary = dict(
        patient=f'Patient {i+1}',
        samples=len(signal),
        mean=signal.mean(),
        std=signal.std(),
    )
    results.append(summary)

# Tabulate the per-patient summaries and print them without the index column
results_df = pd.DataFrame(results)
print("=== Batch Processing Results ===")
print(results_df.to_string(index=False))

Example 3: Multi-Signal Comparison

# Load multi-channel data
channels = load_multi_channel('multi_channel.csv')

# Keep only the signal channels. Filtering once up front (instead of skipping
# 'time' inside an enumerate loop) keeps the subplot index aligned with the
# signals: otherwise a 'time' entry consumes an index, leaving a blank panel
# and pushing the last signal past the end of the axes array.
signal_channels = {
    name: signal for name, signal in channels.items() if name != 'time'
}

# Compare signals
print("=== Multi-Signal Comparison ===")
print(f"{'Signal':<10} {'Mean':<12} {'Std':<12} {'Peak-to-Peak':<15}")
print("-" * 50)

for name, signal in signal_channels.items():
    mean_val = signal.mean()
    std_val = signal.std()
    p2p = signal.max() - signal.min()
    print(f"{name:<10} {mean_val:<12.4f} {std_val:<12.4f} {p2p:<15.4f}")

# Visualize comparison: one histogram per signal channel
if signal_channels:
    # squeeze=False always returns a 2-D array of axes, so a single channel
    # is handled the same way as several
    fig, axes = plt.subplots(
        1, len(signal_channels), figsize=(15, 4), squeeze=False
    )

    for ax, (name, signal) in zip(axes.ravel(), signal_channels.items()):
        ax.hist(signal, bins=50, edgecolor='black', alpha=0.7)
        ax.set_xlabel('Amplitude')
        ax.set_ylabel('Frequency')
        ax.set_title(f'{name} Distribution')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

Cleanup

Remove generated files

import os

# Every file this notebook may have written to the working directory
cleanup_files = [
    'sample_ecg.csv', 'sample_ecg.json', 'sample_ecg.npy',
    'multi_signals.npz', 'multi_channel.csv',
    'signal_with_issues.csv',
    'exported_data.csv', 'exported_data.json', 'exported_data.pkl',
    'patient_1.csv', 'patient_2.csv', 'patient_3.csv'
]

# Visit only the files that are actually present (checked lazily, one at a
# time) and delete each, reporting what was removed
for filename in filter(os.path.exists, cleanup_files):
    os.remove(filename)
    print(f"Removed: {filename}")

print("\nCleanup complete!")

Summary

This notebook demonstrated:

  1. ✓ Loading data from multiple formats (CSV, JSON, NumPy, etc.)

  2. ✓ Working with multi-channel physiological signals

  3. ✓ Data validation and quality checks

  4. ✓ Metadata extraction and sampling rate detection

  5. ✓ Data export to various formats

  6. ✓ Advanced features (array/DataFrame loading, format detection)

  7. ✓ Real-world analysis workflows

Next Steps

  • Explore preprocessing with vitalDSP.preprocess

  • Try feature extraction with vitalDSP.physiological_features

  • Analyze signal quality with vitalDSP.signal_quality_assessment

Resources