170 changes: 170 additions & 0 deletions WHISPER_INTEGRATION.md
@@ -0,0 +1,170 @@
# Whisper.cpp Integration Guide

This document describes how the Whisper.cpp integration works in the Pulse app using `whisper.rn`.

## Overview

The app now uses real Whisper.cpp models for speech-to-text transcription instead of mock data. The integration includes:

- Automatic model downloading (ggml-tiny.en.bin)
- Real-time transcription with timestamps
- Fallback to demo mode during development
- Cross-platform support (iOS/Android)

## Implementation Details

### Model Management

The app automatically downloads the `ggml-tiny.en.bin` model (~40MB) from Hugging Face:
- **URL**: `https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin`
- **Storage**: Device's document directory
- **Size**: ~40MB (tiny model, English only)
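The bootstrap above can be sketched as a small helper. This is a minimal sketch, not the app's actual code: the `ModelFs` interface stands in for `expo-file-system` (`getInfoAsync` / `downloadAsync`), injected so the logic is readable and testable without a device.

```typescript
// Minimal sketch of the model bootstrap. ModelFs stands in for
// expo-file-system; the helper names are illustrative, not the app's own.
interface ModelFs {
  exists(path: string): Promise<boolean>;
  download(url: string, path: string): Promise<void>;
}

const MODEL_URL =
  'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin';

async function ensureModel(fs: ModelFs, documentDirectory: string): Promise<string> {
  const modelPath = `${documentDirectory}ggml-tiny.en.bin`;
  // Download once; later launches reuse the file already on disk.
  if (!(await fs.exists(modelPath))) {
    await fs.download(MODEL_URL, modelPath);
  }
  return modelPath;
}
```

In the real implementation the `ModelFs` calls would map onto `expo-file-system`'s `getInfoAsync` and `downloadAsync` with `FileSystem.documentDirectory` as the base path.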

### Transcription Flow

1. **Initialization**: Download model if not present
2. **Context Creation**: Initialize Whisper context with the model
3. **Transcription**: Process audio/video file
4. **Conversion**: Convert results to app's transcript format
5. **Storage**: Save transcript with timestamps and metadata
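Step 4 (Conversion) can be sketched as a pure mapping. The segment shape below is an assumption: whisper.cpp reports segment times `t0`/`t1` in 10 ms ticks, and whisper.rn surfaces a similar shape; the app's transcript type is simplified here.

```typescript
// Assumed result shape: segments carry t0/t1 in 10 ms ticks, following
// whisper.cpp's convention. TranscriptSegment is simplified for this sketch.
interface WhisperSegment { text: string; t0: number; t1: number }
interface TranscriptSegment { id: string; startMs: number; endMs: number; text: string }

function toTranscriptSegments(segments: WhisperSegment[]): TranscriptSegment[] {
  return segments.map((s, i) => ({
    id: String(i + 1),
    startMs: s.t0 * 10, // 10 ms ticks -> milliseconds
    endMs: s.t1 * 10,
    text: s.text.trim(),
  }));
}
```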

### Platform Configuration

#### iOS Setup

1. **Pods Installation**: Run `npx pod-install` after `npm install`
2. **Permissions**: Add microphone permission to `Info.plist` if using realtime transcription:
```xml
<key>NSMicrophoneUsageDescription</key>
<string>This app requires microphone access for voice transcription</string>
```
3. **Extended Virtual Addressing**: For larger models, enable this capability in the Xcode project settings

#### Android Setup

1. **ProGuard**: Add rule to `android/app/proguard-rules.pro`:
```proguard
# whisper.rn
-keep class com.rnwhisper.** { *; }
```
2. **Permissions**: Add to `AndroidManifest.xml` for realtime transcription:
```xml
<uses-permission android:name="android.permission.RECORD_AUDIO" />
```

## Usage

### Basic Transcription

```typescript
import { useTranscription } from '../hooks/useTranscription';

const { transcript, isTranscribing, transcribeVideo } = useTranscription(draftId);

// Start transcription
await transcribeVideo(videoUri, 'en');
```

### Supported Languages

The implementation supports all Whisper languages including:
- English (en) - default
- Spanish (es), French (fr), German (de)
- Chinese (zh), Japanese (ja), Korean (ko)
- And many more...

### Error Handling

The implementation includes graceful error handling:

1. **Model Download Failures**: Network connectivity issues
2. **Transcription Errors**: Unsupported formats, processing failures
3. **Fallback Mode**: Demo transcripts in development environment

## Performance Notes

### Model Size vs Quality Trade-offs

- **tiny.en** (~40MB): Fast, English-only, good quality for most use cases
- **base** (~150MB): Better accuracy, multilingual
- **small** (~500MB): Higher accuracy, slower processing
- **medium/large**: Require Extended Virtual Addressing on iOS

### Optimization Settings

The implementation uses optimized settings:
- **Temperature**: 0.0 (deterministic results)
- **Beam Size**: 5 (quality vs speed balance)
- **Thread Count**: Platform-optimized (iOS: 4, Android: 2)
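These settings can be bundled into an options builder. The option names (`temperature`, `beamSize`, `maxThreads`) are assumed to match whisper.rn's transcribe options; check the library's typings before relying on them.

```typescript
// Builds the transcription options listed above. Option names assumed from
// whisper.rn's transcribe options; verify against the library's typings.
function transcribeOptions(platform: 'ios' | 'android', language: string) {
  return {
    language,
    temperature: 0,                         // deterministic decoding
    beamSize: 5,                            // quality vs. speed balance
    maxThreads: platform === 'ios' ? 4 : 2, // platform-tuned thread count
  };
}
```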

## Development vs Production

### Development Mode
- Always reports as "supported"
- Falls back to demo transcripts on errors
- Includes [DEMO] prefix in results
- Detailed console logging

### Production Mode
- Strict support checking
- Real error propagation
- No fallback transcripts
- Minimal logging
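The split between the two modes can be summarized as a behavior table keyed on the dev flag. `isDev` stands in for React Native's `__DEV__` global; the type and function names are illustrative.

```typescript
// Sketch of the development/production behavior split described above.
interface ModeBehavior {
  strictSupportCheck: boolean;
  allowDemoFallback: boolean;
  verboseLogging: boolean;
}

function behaviorFor(isDev: boolean): ModeBehavior {
  return {
    strictSupportCheck: !isDev, // production: verify real device support
    allowDemoFallback: isDev,   // development: demo transcripts on error
    verboseLogging: isDev,      // development: detailed console logging
  };
}
```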

## Troubleshooting

### Common Issues

1. **Model Download Fails**
- Check internet connectivity
- Verify storage permissions
- Try clearing app data and retrying

2. **Transcription Returns Empty Results**
- Ensure audio/video file is valid
- Check if file format is supported
- Verify file isn't corrupted

3. **iOS Build Issues**
- Run `npx pod-install`
- Clean build folder in Xcode
- Ensure correct iOS deployment target

4. **Android Build Issues**
- Check the NDK version in the Gradle config
- Verify ProGuard rules are applied
- Clear gradle cache

### Performance Issues

1. **Slow Transcription**
- Consider using a smaller model (tiny vs. base)
- Reduce thread count on lower-end devices
- Optimize audio file length

2. **Memory Issues**
- Release Whisper context when not needed
- Use smaller models
- Process shorter audio segments

## Future Enhancements

Potential improvements for the integration:

1. **Model Selection**: Allow users to choose model size
2. **Audio Extraction**: Direct video-to-audio conversion
3. **Streaming Transcription**: Real-time transcription during recording
4. **Custom Models**: Support for fine-tuned models
5. **Background Processing**: Transcribe while app is backgrounded

## Dependencies

- `whisper.rn@^0.4.3`: React Native Whisper.cpp bindings
- `expo-file-system`: File operations for model storage
- `@react-native-async-storage/async-storage`: Transcript storage

## References

- [whisper.rn GitHub](https://github.com/mybigday/whisper.rn)
- [Whisper.cpp Models](https://huggingface.co/ggerganov/whisper.cpp)
- [OpenAI Whisper](https://github.com/openai/whisper)
188 changes: 188 additions & 0 deletions __tests__/retiming.test.ts
@@ -0,0 +1,188 @@
import { RetimingEngine } from '../utils/retiming';
import { VideoTranscript, TranscriptSegment, EditDecisionList } from '../types/transcription';
import { RecordingSegment } from '../components/RecordingProgressBar';

describe('RetimingEngine', () => {
const mockRecordingSegments: RecordingSegment[] = [
{
id: '1',
duration: 3,
uri: 'video1.mp4',
inMs: 0,
outMs: 3000,
},
{
id: '2',
duration: 2,
uri: 'video2.mp4',
inMs: 500,
outMs: 2500,
},
];

const mockTranscriptSegments: TranscriptSegment[] = [
{
id: '1',
startMs: 0,
endMs: 2000,
text: 'Hello world',
confidence: 0.95,
words: [
{ text: 'Hello', startMs: 0, endMs: 1000, confidence: 0.95 },
{ text: 'world', startMs: 1000, endMs: 2000, confidence: 0.95 },
],
},
{
id: '2',
startMs: 3500,
endMs: 5000,
text: 'Testing transcription',
confidence: 0.90,
words: [
{ text: 'Testing', startMs: 3500, endMs: 4200, confidence: 0.90 },
{ text: 'transcription', startMs: 4200, endMs: 5000, confidence: 0.90 },
],
},
];

const mockTranscript: VideoTranscript = {
id: '1',
videoId: 'test-video',
segments: mockTranscriptSegments,
language: 'en',
durationMs: 5000,
createdAt: new Date(),
model: 'whisper-base',
status: 'completed',
};

describe('generateEDLFromSegments', () => {
it('should generate correct EDL from recording segments', () => {
const edl = RetimingEngine.generateEDLFromSegments(mockRecordingSegments);

expect(edl.entries).toHaveLength(2);

// First segment: 0-3000ms maps to 0-3000ms
expect(edl.entries[0]).toEqual({
originalStartMs: 0,
originalEndMs: 3000,
newStartMs: 0,
newEndMs: 3000,
operation: 'keep',
});

// Second segment: 500-2500ms maps to 3000-5000ms
expect(edl.entries[1]).toEqual({
originalStartMs: 500,
originalEndMs: 2500,
newStartMs: 3000,
newEndMs: 5000,
operation: 'keep',
});

expect(edl.newDurationMs).toBe(5000);
});

it('should handle segments without trim points', () => {
const segments: RecordingSegment[] = [
{ id: '1', duration: 2, uri: 'video1.mp4' },
{ id: '2', duration: 3, uri: 'video2.mp4' },
];

const edl = RetimingEngine.generateEDLFromSegments(segments);

expect(edl.entries).toHaveLength(2);
expect(edl.entries[0].originalStartMs).toBe(0);
expect(edl.entries[0].originalEndMs).toBe(2000);
expect(edl.entries[1].originalStartMs).toBe(0);
expect(edl.entries[1].originalEndMs).toBe(3000);
});
});

describe('retimeTranscript', () => {
it('should retime transcript segments correctly', () => {
const edl = RetimingEngine.generateEDLFromSegments(mockRecordingSegments);
const retimedTranscript = RetimingEngine.retimeTranscript(mockTranscript, edl);

expect(retimedTranscript.segments).toHaveLength(1);

// Only the first segment should be kept (0-2000ms fits in 0-3000ms range)
const retimedSegment = retimedTranscript.segments[0];
expect(retimedSegment.startMs).toBe(0);
expect(retimedSegment.endMs).toBe(2000);
expect(retimedSegment.words).toHaveLength(2);
});

it('should exclude words outside of kept ranges', () => {
const edl: EditDecisionList = {
entries: [
{
originalStartMs: 0,
originalEndMs: 1500,
newStartMs: 0,
newEndMs: 1500,
operation: 'keep',
},
],
videoId: 'test',
originalDurationMs: 5000,
newDurationMs: 1500,
};

const retimedTranscript = RetimingEngine.retimeTranscript(mockTranscript, edl);

// Should only include first word (0-1000ms)
expect(retimedTranscript.segments).toHaveLength(1);
expect(retimedTranscript.segments[0].words).toHaveLength(1);
expect(retimedTranscript.segments[0].words[0].text).toBe('Hello');
});
});

describe('validateEDL', () => {
it('should validate correct EDL', () => {
const edl = RetimingEngine.generateEDLFromSegments(mockRecordingSegments);
expect(RetimingEngine.validateEDL(edl)).toBe(true);
});

it('should reject empty EDL', () => {
const edl: EditDecisionList = {
entries: [],
videoId: 'test',
originalDurationMs: 1000,
newDurationMs: 0,
};
expect(RetimingEngine.validateEDL(edl)).toBe(false);
});

it('should reject EDL with negative duration', () => {
const edl: EditDecisionList = {
entries: [
{
originalStartMs: 1000,
originalEndMs: 500, // End before start
newStartMs: 0,
newEndMs: 500,
operation: 'keep',
},
],
videoId: 'test',
originalDurationMs: 1000,
newDurationMs: 500,
};
expect(RetimingEngine.validateEDL(edl)).toBe(false);
});
});

describe('getRetimingStats', () => {
it('should calculate correct retiming statistics', () => {
const edl = RetimingEngine.generateEDLFromSegments(mockRecordingSegments);
const retimingResult = RetimingEngine.createRetimingResult(mockTranscript, mockRecordingSegments);
const stats = RetimingEngine.getRetimingStats(retimingResult);

expect(stats.originalWordCount).toBe(4); // 2 words in each of the 2 segments
expect(stats.originalDurationMs).toBe(5000);
expect(stats.newDurationMs).toBe(5000);
expect(stats.compressionRatio).toBe(100);
});
});
});