vitalDSP Data Loader Tutorial
This notebook demonstrates the comprehensive data loading capabilities of vitalDSP.
Contents
Basic Loading
Format-Specific Examples
Multi-Channel Data
Data Validation
Metadata Extraction
Data Export
Advanced Features
Real-World Examples
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from vitalDSP.utils.data_processing.data_loader import (
DataLoader,
DataFormat,
SignalType,
load_signal,
load_multi_channel
)
# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline
1. Basic Loading
Creating Sample Data
# Create sample ECG data
fs = 250  # Sampling rate (Hz)
duration = 10  # seconds

# Build the time axis from the sample index so that consecutive samples are
# exactly 1/fs apart. np.linspace(0, duration, fs * duration) includes the
# endpoint, which stretches the spacing to duration / (N - 1) (about 249.9 Hz
# here) and no longer matches the 250 Hz sampling rate used when loading.
t = np.arange(fs * duration) / fs

# Simulate ECG signal: a 1.2 Hz sinusoid plus a weaker 2.4 Hz component,
# with additive Gaussian noise.
ecg = np.sin(2 * np.pi * 1.2 * t) + 0.3 * np.sin(2 * np.pi * 2.4 * t)
ecg += 0.1 * np.random.randn(len(t))

# Create DataFrame
df = pd.DataFrame({
    'time': t,
    'ecg': ecg,
})

# Save to CSV (without the index column)
df.to_csv('sample_ecg.csv', index=False)
print(f"Created sample ECG data: {len(df)} samples at {fs} Hz")
Loading CSV Data
# Load the CSV file
# The sampling rate and signal type are given to the loader up front; the
# 'time' column name is passed to load() as the time column.
loader = DataLoader('sample_ecg.csv', sampling_rate=250.0, signal_type='ecg')
data = loader.load(time_column='time')
# Summarise the result: row count, column names and a preview of the first rows.
print(f"Loaded {len(data)} samples")
print(f"Columns: {list(data.columns)}")
print(f"\nFirst few rows:")
print(data.head())
Quick Loading with Convenience Function
# Quick load
# load_signal is the convenience function imported from the data_loader module;
# here it is called with just the path and a sampling rate.
data_quick = load_signal('sample_ecg.csv', sampling_rate=250)
print(f"Quick loaded: {data_quick.shape}")
Visualizing Loaded Data
# Draw the loaded ECG trace against its time column on a single wide axes.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(data['time'], data['ecg'])
ax.set_xlabel('Time (s)')
ax.set_ylabel('ECG (mV)')
ax.set_title('Loaded ECG Signal')
ax.grid(True)
plt.show()
2. Format-Specific Examples
JSON Format
import json

# Create JSON data with metadata.
# Only the first n_json_samples rows are written, so the recorded duration is
# derived from that excerpt rather than hard-coded to the full 10 s recording.
json_fs = 250
n_json_samples = 100
json_data = {
    'sampling_rate': json_fs,
    'signal_type': 'ecg',
    'duration': n_json_samples / json_fs,
    'data': [
        # Distinct names so the comprehension does not shadow the global `t`.
        {'time': float(time_value), 'ecg': float(ecg_value)}
        for time_value, ecg_value in zip(
            data['time'][:n_json_samples], data['ecg'][:n_json_samples]
        )
    ],
}
with open('sample_ecg.json', 'w', encoding='utf-8') as f:
    json.dump(json_data, f)

# Load JSON
loader_json = DataLoader('sample_ecg.json')
data_json = loader_json.load()
print(f"Loaded from JSON: {len(data_json)} samples")
print(f"Metadata: {loader_json.metadata}")
NumPy Format
# Save as .npy
# Only the raw ECG values are written; the time column is not stored in the file.
np.save('sample_ecg.npy', data['ecg'].values)
# Load .npy
# No sampling rate or signal type is passed to the loader here.
loader_npy = DataLoader('sample_ecg.npy')
data_npy = loader_npy.load()
print(f"Loaded from .npy: {data_npy.shape}")
print(f"Metadata: {loader_npy.metadata}")
Multi-signal NPZ Format
# Create multi-signal data
# PPG: 1.0 Hz sinusoid (amplitude 0.5) plus Gaussian noise scaled by 0.1.
ppg = 0.5 * np.sin(2 * np.pi * 1.0 * t) + 0.1 * np.random.randn(len(t))
# Respiration: 0.3 Hz sinusoid (amplitude 0.3) plus Gaussian noise scaled by 0.05.
resp = 0.3 * np.sin(2 * np.pi * 0.3 * t) + 0.05 * np.random.randn(len(t))
# Save as .npz
# Each keyword becomes a named array inside the archive.
np.savez('multi_signals.npz', ecg=ecg, ppg=ppg, resp=resp, time=t)
# Load .npz
loader_npz = DataLoader('multi_signals.npz')
data_npz = loader_npz.load()
# The loaded object is used as a mapping from signal name to array below.
print(f"Loaded signals: {list(data_npz.keys())}")
print(f"ECG shape: {data_npz['ecg'].shape}")
print(f"PPG shape: {data_npz['ppg'].shape}")
3. Multi-Channel Data
Creating Multi-Channel Dataset
# Assemble the shared time axis and the three simulated signals into one table,
# one column per channel, and write it out without the index column.
df_multi = pd.DataFrame(dict(time=t, ECG=ecg, PPG=ppg, RESP=resp))
df_multi.to_csv('multi_channel.csv', index=False)

print(f"Created multi-channel data: {df_multi.shape}")
print(f"Channels: {df_multi.columns.tolist()}")
Loading Specific Channels
# Load only ECG and PPG
# The `columns` argument lists the columns to load; 'time' is requested
# alongside the two signal channels and 'RESP' is left out.
loader_multi = DataLoader('multi_channel.csv')
data_selected = loader_multi.load(columns=['time', 'ECG', 'PPG'])
print(f"Selected channels: {list(data_selected.columns)}")
Using load_multi_channel
# Fetch the three named channels as a name -> signal mapping and print a
# one-line summary (length, mean, standard deviation) for each.
channels = load_multi_channel(
    'multi_channel.csv',
    channels=['ECG', 'PPG', 'RESP'],
)
for name, signal in channels.items():
    sample_count = len(signal)
    print(f"{name}: {sample_count} samples, mean={signal.mean():.3f}, std={signal.std():.3f}")
Visualizing Multi-Channel Data
# One stacked panel per channel, all sharing the time axis.
fig, axes = plt.subplots(3, 1, figsize=(14, 10), sharex=True)

# (channel key, y-axis label, panel title) for each row, top to bottom.
panel_specs = [
    ('ECG', 'ECG (mV)', 'ECG Signal'),
    ('PPG', 'PPG (AU)', 'PPG Signal'),
    ('RESP', 'RESP (AU)', 'Respiratory Signal'),
]
for panel, (channel_key, y_label, panel_title) in zip(axes, panel_specs):
    panel.plot(t, channels[channel_key])
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.grid(True)

# Only the bottom panel carries the x-axis label.
axes[2].set_xlabel('Time (s)')

plt.tight_layout()
plt.show()
4. Data Validation
Creating Data with Issues
# Start from a copy of the clean ECG so the original array stays untouched,
# then inject ten consecutive NaN samples and a single +Inf sample.
signal_with_issues = ecg.copy()
signal_with_issues[100:110] = np.nan
signal_with_issues[500] = np.inf

df_issues = pd.DataFrame({'time': t, 'signal': signal_with_issues})
df_issues.to_csv('signal_with_issues.csv', index=False)

# Report how many samples of each kind ended up in the table.
nan_count = df_issues['signal'].isna().sum()
inf_count = np.isinf(df_issues['signal']).sum()
print(f"Created signal with {nan_count} NaN values")
print(f"Created signal with {inf_count} Inf values")
Loading with Validation
# NOTE(review): `warnings` is imported here but not referenced anywhere in this
# cell; how the loader reports validation problems is not visible from here.
import warnings
# Enable validation (will show warnings)
print("Loading with validation enabled:")
# validate=True is passed to the loader for the file containing NaN/Inf samples.
loader_validate = DataLoader('signal_with_issues.csv', validate=True)
data_validated = loader_validate.load()
print(f"\nData loaded: {data_validated.shape}")
5. Metadata Extraction
Extracting Comprehensive Info
# Load the multi-channel file with an explicit sampling rate and signal type,
# then ask the loader for its information dictionary.
loader = DataLoader('multi_channel.csv', sampling_rate=250.0, signal_type='ecg')
data = loader.load(time_column='time')
info = loader.get_info()

# Headline fields first.
print("=== Data Information ===")
print(f"File: {info['file_path']}")
print(f"Format: {info['format']}")
print(f"Signal Type: {info['signal_type']}")
print(f"Sampling Rate: {info['sampling_rate']} Hz")

# Then every entry of the nested metadata mapping, one per line.
print(f"\nMetadata:")
metadata_items = info['metadata'].items()
for key, value in metadata_items:
    print(f" {key}: {value}")
Automatic Sampling Rate Detection
# Load without specifying sampling rate
# Only the time column is supplied; no sampling_rate argument is given.
loader_auto = DataLoader('multi_channel.csv')
data_auto = loader_auto.load(time_column='time')
# The rate is read from the loader's metadata under 'computed_sampling_rate';
# 'N/A' is printed if that key is absent.
print(f"Computed sampling rate: {loader_auto.metadata.get('computed_sampling_rate', 'N/A')} Hz")
print(f"Expected: 250 Hz")
6. Data Export
Exporting to Multiple Formats
# Reload the multi-channel table, then write it back out once per target
# format. The export format is chosen by the output file name passed in.
loader = DataLoader('multi_channel.csv')
data = loader.load()

print("Exporting to multiple formats...")
export_targets = (
    ('exported_data.csv', 'CSV'),
    ('exported_data.json', 'JSON'),
    ('exported_data.pkl', 'Pickle'),
)
for export_path, format_label in export_targets:
    loader.export(data, export_path)
    print(f"✓ Exported to {format_label}")
print("\nExport complete!")
Verifying Exported Data
# Load back the exported JSON
loader_verify = DataLoader('exported_data.json')
data_verify = loader_verify.load()
print(f"Original shape: {data.shape}")
print(f"Exported and reloaded shape: {data_verify.shape}")
# The check compares shapes only; the values themselves are not compared.
print(f"\nData integrity: {'✓ PASSED' if data.shape == data_verify.shape else '✗ FAILED'}")
7. Advanced Features
Loading from NumPy Array
# Wrap an in-memory array: a loader is created with no file path and the
# sampling rate / signal type are supplied at load time instead.
signal_array = np.random.randn(1000)

loader = DataLoader()
array_options = {'sampling_rate': 250.0, 'signal_type': 'ecg'}
df_from_array = loader.load_from_array(signal_array, **array_options)

print(f"Loaded from array: {df_from_array.shape}")
print(f"Sampling rate: {loader.sampling_rate} Hz")
print(f"Signal type: {loader.signal_type.value}")
Loading from DataFrame
# Build a two-column table of random samples (drawn in ecg, ppg order) and
# hand it to a path-less loader together with its sampling rate.
ecg_samples = np.random.randn(500)
ppg_samples = np.random.randn(500)
df_custom = pd.DataFrame({'ecg': ecg_samples, 'ppg': ppg_samples})

loader = DataLoader()
df_loaded = loader.load_from_dataframe(df_custom, sampling_rate=100.0)

print(f"Loaded from DataFrame: {df_loaded.shape}")
print(f"Metadata: {loader.metadata}")
Format Detection
import tempfile

# Test format detection
test_files = [
    'data.csv',
    'data.json',
    'data.xlsx',
    'data.npy',
    'data.mat',
]
print("=== Format Detection ===")
# Create the empty placeholder files inside a throw-away directory instead of
# the working directory: touching and then unlinking 'data.csv' etc. in place
# would delete a real file of the same name. The context manager also removes
# the placeholders if the loader raises.
with tempfile.TemporaryDirectory() as scratch_dir:
    for filename in test_files:
        placeholder = Path(scratch_dir) / filename
        # Create empty file
        placeholder.touch()
        loader = DataLoader(str(placeholder))
        print(f"{filename:15} -> {loader.format.value}")
List Supported Formats
# Get supported formats
formats = DataLoader.list_supported_formats()
print("=== Supported Formats ===")
# Drop the 'unknown' placeholder before numbering so the printed list is
# numbered contiguously (filtering inside the loop would leave a gap).
known_formats = [fmt for fmt in formats if fmt != 'unknown']
for i, fmt in enumerate(known_formats, 1):
    req = DataLoader.get_format_requirements(fmt)
    if req:
        # Show the description when requirements are available for the format.
        print(f"{i}. {fmt.upper():10} - {req.get('description', 'N/A')}")
    else:
        print(f"{i}. {fmt.upper()}")
8. Real-World Examples
Example 1: ECG Analysis Pipeline
# Complete ECG analysis workflow: load -> extract arrays -> summarise -> plot.

# 1. Load data
loader = DataLoader('sample_ecg.csv', signal_type='ecg')
df = loader.load(time_column='time')

# 2. Extract signal and time axis as plain arrays
ecg_signal = df['ecg'].values
time = df['time'].values

# 3. Basic statistics
print("=== ECG Analysis ===")
print(f"Duration: {time[-1]:.1f} seconds")
print(f"Samples: {len(ecg_signal)}")
amplitude_stats = (
    ('Mean', ecg_signal.mean()),
    ('Std Dev', ecg_signal.std()),
    ('Min', ecg_signal.min()),
    ('Max', ecg_signal.max()),
)
for stat_label, stat_value in amplitude_stats:
    print(f"{stat_label}: {stat_value:.3f} mV")

# 4. Visualize
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(time, ecg_signal)
ax.set_xlabel('Time (s)')
ax.set_ylabel('ECG (mV)')
ax.set_title('ECG Signal Analysis')
ax.grid(True, alpha=0.3)
plt.show()
Example 2: Batch Processing
# Simulate multiple patient recordings
patient_fs = 250  # Hz; the batch loader below is given sampling_rate=250.0
patient_samples = 1250  # 5 seconds per recording at 250 Hz

# Index-based time axis so samples are exactly 1/patient_fs apart.
# np.linspace(0, 5, 1250) includes the endpoint and would give a spacing of
# 5/1249 s, which does not correspond to 250 Hz.
t_patient = np.arange(patient_samples) / patient_fs

for i in range(3):
    # Create different signals: the sinusoid frequency rises by 0.1 Hz per patient
    signal = np.sin(2 * np.pi * (1.0 + i * 0.1) * t_patient)
    signal += 0.1 * np.random.randn(len(t_patient))
    df_patient = pd.DataFrame({'time': t_patient, 'ecg': signal})
    df_patient.to_csv(f'patient_{i+1}.csv', index=False)
# Batch process all patients: load each recording, reduce its ECG column to a
# few summary numbers, and collect one row per patient.
results = []
for i in range(3):
    filename = f'patient_{i+1}.csv'

    # Load
    loader = DataLoader(filename, sampling_rate=250.0)
    data = loader.load()

    # Analyze
    signal = data['ecg'].values
    patient_summary = dict(
        patient=f'Patient {i+1}',
        samples=len(signal),
        mean=signal.mean(),
        std=signal.std(),
    )
    results.append(patient_summary)

# Display results as a table without the index column
results_df = pd.DataFrame(results)
print("=== Batch Processing Results ===")
print(results_df.to_string(index=False))
Example 3: Multi-Signal Comparison
# Load multi-channel data
channels = load_multi_channel('multi_channel.csv')

# Keep only the signal channels. Filtering out 'time' up front (instead of
# skipping it inside the loops) keeps the subplot index aligned with the
# channels actually drawn; enumerating the unfiltered mapping would shift every
# index after 'time' and could run past the last axes.
signal_channels = {
    name: signal for name, signal in channels.items() if name != 'time'
}

# Compare signals
print("=== Multi-Signal Comparison ===")
print(f"{'Signal':<10} {'Mean':<12} {'Std':<12} {'Peak-to-Peak':<15}")
print("-" * 50)
for name, signal in signal_channels.items():
    mean_val = signal.mean()
    std_val = signal.std()
    p2p = signal.max() - signal.min()
    print(f"{name:<10} {mean_val:<12.4f} {std_val:<12.4f} {p2p:<15.4f}")

# Visualize comparison: one histogram per signal channel.
if signal_channels:
    n_panels = len(signal_channels)
    # squeeze=False always returns a 2-D axes array, so a single channel works too.
    fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4), squeeze=False)
    for ax, (name, signal) in zip(axes[0], signal_channels.items()):
        ax.hist(signal, bins=50, edgecolor='black', alpha=0.7)
        ax.set_xlabel('Amplitude')
        ax.set_ylabel('Frequency')
        ax.set_title(f'{name} Distribution')
        ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
Cleanup
Remove generated files
import os

# Every file this notebook may have written to the working directory.
cleanup_files = [
    'sample_ecg.csv', 'sample_ecg.json', 'sample_ecg.npy',
    'multi_signals.npz', 'multi_channel.csv',
    'signal_with_issues.csv',
    'exported_data.csv', 'exported_data.json', 'exported_data.pkl',
    'patient_1.csv', 'patient_2.csv', 'patient_3.csv',
]

# filter() is lazy, so each path is checked immediately before it is removed;
# paths that do not exist are skipped silently.
for filename in filter(os.path.exists, cleanup_files):
    os.remove(filename)
    print(f"Removed: {filename}")
print("\nCleanup complete!")
Summary
This notebook demonstrated:
✓ Loading data from multiple formats (CSV, JSON, NumPy, etc.)
✓ Working with multi-channel physiological signals
✓ Data validation and quality checks
✓ Metadata extraction and sampling rate detection
✓ Data export to various formats
✓ Advanced features (array/DataFrame loading, format detection)
✓ Real-world analysis workflows
Next Steps
Explore preprocessing with vitalDSP.preprocess
Try feature extraction with vitalDSP.physiological_features
Analyze signal quality with vitalDSP.signal_quality_assessment