From fe7d01fff775d5f29f6fbec2a9fab180152068f6 Mon Sep 17 00:00:00 2001 From: Cod-e-Codes Date: Mon, 29 Sep 2025 17:42:50 -0400 Subject: [PATCH] feat: add optional transcription with multi-provider support - Add transcription.go with whisper.cpp, Vosk, OpenAI API, and Python script providers - Integrate transcription into main app with Ctrl+T keybinding - Add transcription settings to UI with provider status indicators - Show transcription status in memo list and include in search - Auto-transcribe option for new recordings - Complete setup documentation in README --- README.md | 88 ++++++- main.go | 295 ++++++++++++++++++++--- transcription.go | 603 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 945 insertions(+), 41 deletions(-) create mode 100644 transcription.go diff --git a/README.md b/README.md index e7532ae..02d68b1 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ Audio configuration interface displaying hardware/audio settings, available devi - Add tags for organization - Delete memos - Export memos to Downloads folder +- **Optional transcription** with multiple provider support ### User Interface - Terminal user interface using Bubble Tea @@ -115,16 +116,18 @@ go build -o voicelog main.go | `ctrl+x` | Stop playback | | `?` | Show help | | `ctrl+s` | Settings | -| `ctrl+t` | Generate test file | +| `ctrl+t` | Transcribe selected memo | +| `F5` | Generate test file | | `ESC/q` | Quit | ### Basic Operations 1. **Recording**: Press `SPACE` to start/stop recording 2. **Playback**: Select a memo and press `ENTER` to play -3. **Settings**: Press `ctrl+s` to configure audio devices -4. **Test File**: Press `ctrl+t` to generate a 5-second 440Hz test tone -5. **Export**: Press `e` to export selected memo to Downloads folder +3. **Transcription**: Press `ctrl+t` to transcribe selected memo (optional) +4. **Settings**: Press `ctrl+s` to configure audio devices and transcription +5. **Test File**: Press `F5` to generate a 5-second 440Hz test tone +6. **Export**: Press `ctrl+e` to export selected memo to Downloads folder ### Audio Processing Features @@ -146,12 +149,86 @@ VoiceLog includes advanced audio processing capabilities: - **Compact Mode**: Memo list becomes compact when audio visualizer is active - **Real-Time Updates**: Waveform and meters update in real-time during operation +### Transcription (Optional) + +VoiceLog supports optional voice-to-text transcription through a flexible plugin system. Transcription is **completely optional** - the application works perfectly without it. + +#### Supported Transcription Providers + +1. **whisper.cpp (Recommended - Local & Private)** + - High accuracy, supports many languages + - Runs entirely offline - no internet required + - Complete privacy - audio never leaves your machine + - Installation: [github.com/ggerganov/whisper.cpp](https://github.com/ggerganov/whisper.cpp) + +2. **OpenAI Whisper API (Cloud-based - Highest Accuracy)** + - Highest accuracy available + - Requires internet connection and API key + - Install: `pip install openai` + - Set `OPENAI_API_KEY` environment variable + +3. **Vosk (Lightweight & Fast)** + - Smaller models, faster processing + - Good for real-time applications + - Installation: [alphacephei.com/vosk](https://alphacephei.com/vosk/) + +4. **Custom Python Script** + - Use any transcription API (AssemblyAI, Rev.ai, etc.) + - Write your own integration script + - Full flexibility for custom workflows + +#### Quick Setup Examples + +**whisper.cpp Setup (Linux/macOS):** +```bash +# Clone and build whisper.cpp +git clone https://github.com/ggerganov/whisper.cpp +cd whisper.cpp && make + +# Download a model (base.en for English, base for multilingual) +./models/download-ggml-model.sh base.en + +# The whisper binary will be auto-detected by VoiceLog +``` + +**OpenAI Whisper API Setup:** +```bash +# Install the OpenAI library +pip install openai + +# Set your API key (get one from https://platform.openai.com) +export OPENAI_API_KEY="your-api-key-here" +``` + +#### Using Transcription + +1. **Enable in Settings**: Press `ctrl+s` → Navigate to "Transcription:" → Toggle to ON +2. **Select Provider**: Navigate to "Default Provider:" → Choose your installed provider +3. **Transcribe**: Press `ctrl+t` on any memo to transcribe it +4. **Auto-Transcribe**: Enable "Auto Transcribe:" to automatically transcribe new recordings + +#### Transcription Features + +- **Visual Indicators**: Transcribed memos show a 📝 icon in the memo list +- **Search Integration**: Search through transcribed text using the built-in filter +- **Provider Status**: Settings show ✓/✗ status for each provider's availability +- **Flexible Configuration**: Each provider can be configured independently +- **Auto-Detection**: VoiceLog automatically detects available transcription tools + +#### Privacy & Performance + +- **Local Options**: whisper.cpp and Vosk run entirely on your machine +- **Cloud Options**: OpenAI Whisper API provides highest accuracy but requires internet +- **No Telemetry**: VoiceLog never sends any data anywhere (except when using API providers) +- **Storage**: Transcriptions are stored locally alongside memo metadata + ## Configuration Configuration is stored in `~/.voicelog/config.json` and includes: - Audio device settings - Sample rate and format preferences - Audio processing settings (normalization, silence trimming, clipping detection) +- Transcription settings (optional) - Memo storage path - Keybindings @@ -159,8 +236,9 @@ Configuration is stored in `~/.voicelog/config.json` and includes: ``` ~/.voicelog/ ├── config.json # Application configuration +├── transcription.json # Transcription settings (if enabled) ├── memos/ # Voice memo storage -│ ├── metadata.json # Memo metadata +│ ├── metadata.json # Memo metadata (includes transcriptions) │ └── memo_*.wav # Audio files └── voicelog.log # Application logs ``` diff --git a/main.go b/main.go index cda2eff..a7dae1b 100644 --- a/main.go +++ b/main.go @@ -106,14 +106,15 @@ func (f AudioFormat) Extension() string { // Memo represents a voice memo with metadata type Memo struct { - ID string `json:"id"` - Filename string `json:"filename"` - Name string `json:"title"` // Changed from Title to Name to avoid conflict - Duration float64 `json:"duration"` - Created time.Time `json:"created"` - Size int64 `json:"size"` - Tags []string `json:"tags"` - Format string `json:"format"` + ID string `json:"id"` + Filename string `json:"filename"` + Name string `json:"title"` // Changed from Title to Name to avoid conflict + Duration float64 `json:"duration"` + Created time.Time `json:"created"` + Size int64 `json:"size"` + Tags []string `json:"tags"` + Format string `json:"format"` + Transcription *TranscriptionResult `json:"transcription,omitempty"` } // Implement list.Item interface @@ -128,16 +129,27 @@ func (m Memo) Description() string { if len(m.Tags) > 0 { // Truncate tags if they're too long tagString := strings.Join(m.Tags, ", ") - if len(tagString) > 20 { - tagString = tagString[:17] + "..." + if len(tagString) > 15 { // Reduced to make room for transcription indicator + tagString = tagString[:12] + "..." } tags = " [" + tagString + "]" } - return fmt.Sprintf("%s, %s%s", duration, size, tags) + + // Add transcription indicator + transcriptionStatus := "" + if m.Transcription != nil { + transcriptionStatus = " 📝" + } + + return fmt.Sprintf("%s, %s%s%s", duration, size, tags, transcriptionStatus) } func (m Memo) FilterValue() string { - return m.Name + " " + strings.Join(m.Tags, " ") + searchText := m.Name + " " + strings.Join(m.Tags, " ") + if m.Transcription != nil { + searchText += " " + m.Transcription.Text + } + return searchText } // Truncate text to specified length @@ -443,6 +455,9 @@ type Model struct { isClipping bool // Current clipping status peakLevel float32 // Current peak level + // Transcription + transcriptionManager *TranscriptionManager + // UI components textInput textinput.Model help help.Model @@ -467,23 +482,24 @@ type Model struct { // Key bindings type keyMap struct { - Record key.Binding - Play key.Binding - Stop key.Binding - Delete key.Binding - Rename key.Binding - Tag key.Binding - Export key.Binding - Help key.Binding - Settings key.Binding - TestFile key.Binding - Quit key.Binding - Up key.Binding - Down key.Binding - Enter key.Binding - Escape key.Binding - Left key.Binding - Right key.Binding + Record key.Binding + Play key.Binding + Stop key.Binding + Delete key.Binding + Rename key.Binding + Tag key.Binding + Export key.Binding + Help key.Binding + Settings key.Binding + TestFile key.Binding + Transcribe key.Binding + Quit key.Binding + Up key.Binding + Down key.Binding + Enter key.Binding + Escape key.Binding + Left key.Binding + Right key.Binding } // ShortHelp returns keybindings to be shown in the mini help view @@ -494,9 +510,9 @@ func (k keyMap) ShortHelp() []key.Binding { // FullHelp returns keybindings for the expanded help view func (k keyMap) FullHelp() [][]key.Binding { return [][]key.Binding{ - {k.Record, k.Play, k.Stop, k.Up, k.Down}, // Core controls - {k.Rename, k.Tag, k.Delete, k.Export}, // Management - {k.Settings, k.TestFile, k.Help, k.Quit}, // Other + {k.Record, k.Play, k.Stop, k.Up, k.Down}, // Core controls + {k.Rename, k.Tag, k.Delete, k.Export}, // Management + {k.Transcribe, k.Settings, k.TestFile, k.Help, k.Quit}, // Other } } @@ -538,8 +554,12 @@ var keys = keyMap{ key.WithHelp("ctrl+s", "settings"), ), TestFile: key.NewBinding( + key.WithKeys("f5"), + key.WithHelp("f5", "test file"), + ), + Transcribe: key.NewBinding( key.WithKeys("ctrl+t"), - key.WithHelp("ctrl+t", "test file"), + key.WithHelp("ctrl+t", "transcribe"), ), Quit: key.NewBinding( key.WithKeys("q", "ctrl+c"), @@ -677,6 +697,10 @@ func initialModel() Model { audioAnalyzer := NewAudioAnalyzer(config.SampleRate, config.WaveformSampleRate) clippingDetector := NewClippingDetector(config.ClippingThreshold) + // Initialize transcription manager + homeDir, _ := os.UserHomeDir() + transcriptionManager := NewTranscriptionManager(filepath.Join(homeDir, ConfigDir)) + return Model{ state: StateViewing, config: config, @@ -693,6 +717,9 @@ func initialModel() Model { audioAnalyzer: audioAnalyzer, clippingDetector: clippingDetector, realtimeWaveform: make([]float32, config.WaveformSampleRate), + + // Transcription + transcriptionManager: transcriptionManager, } } @@ -1033,11 +1060,19 @@ func (m Model) handleSettingsKeys(msg tea.KeyMsg) (tea.Model, tea.Cmd) { case key.Matches(msg, keys.Up): if m.settingsSelectedIdx > 0 { m.settingsSelectedIdx-- + // Skip separator line + if m.settingsSelectedIdx == 12 && m.settingsSelectedIdx > 0 { + m.settingsSelectedIdx-- + } } case key.Matches(msg, keys.Down): - if m.settingsSelectedIdx < 11 { // 12 settings items (0-11) + if m.settingsSelectedIdx < 15 { // 16 settings items (0-15) - added transcription settings m.settingsSelectedIdx++ + // Skip separator line + if m.settingsSelectedIdx == 12 && m.settingsSelectedIdx < 15 { + m.settingsSelectedIdx++ + } } case key.Matches(msg, keys.Left): @@ -1232,6 +1267,37 @@ func (m *Model) adjustSetting(delta int) { if m.clippingDetector != nil { m.clippingDetector.threshold = m.config.ClippingThreshold } + case 13: // Transcription Enabled + enabled := !m.transcriptionManager.GetConfig().Enabled + if err := m.transcriptionManager.SetEnabled(enabled); err != nil { + log.Printf("Error setting transcription enabled: %v", err) + } + case 14: // Default Provider + providers := m.transcriptionManager.GetAllProviders() + currentProvider := m.transcriptionManager.GetConfig().DefaultProvider + currentIdx := -1 + for i, provider := range providers { + if provider == currentProvider { + currentIdx = i + break + } + } + if currentIdx >= 0 { + nextIdx := (currentIdx + delta + len(providers)) % len(providers) + if err := m.transcriptionManager.SetDefaultProvider(providers[nextIdx]); err != nil { + log.Printf("Error setting default provider: %v", err) + } + } else if len(providers) > 0 { + // Set first provider if no current provider + if err := m.transcriptionManager.SetDefaultProvider(providers[0]); err != nil { + log.Printf("Error setting default provider: %v", err) + } + } + case 15: // Auto Transcribe + auto := !m.transcriptionManager.GetConfig().AutoTranscribe + if err := m.transcriptionManager.SetAutoTranscribe(auto); err != nil { + log.Printf("Error setting auto-transcribe: %v", err) + } } } @@ -1336,6 +1402,11 @@ func (m Model) handleMainKeys(msg tea.KeyMsg) (tea.Model, tea.Cmd) { case key.Matches(msg, keys.TestFile): m.loadTestFile() + case key.Matches(msg, keys.Transcribe): + if len(m.memos) > 0 { + m.transcribeMemo() + } + case key.Matches(msg, keys.Record): if m.recording { m.stopRecording() @@ -1726,6 +1797,10 @@ func (m *Model) stopRecording() { // Add to memos list m.memos = append([]Memo{memo}, m.memos...) + + // Auto-transcribe if enabled + m.autoTranscribeMemo(&memo) + // Refresh list items to include the new memo m.memoList.SetItems(convertMemosToListItems(m.memos)) @@ -2056,8 +2131,13 @@ func (m Model) renderSettings() string { "Auto Trim Silence:", "Silence Threshold:", "Clipping Threshold:", + "", // Separator + "Transcription:", + "Default Provider:", + "Auto Transcribe:", } + transcriptionConfig := m.transcriptionManager.GetConfig() values := []string{ m.getDeviceName(m.config.InputDevice), m.getDeviceName(m.config.OutputDevice), @@ -2071,10 +2151,20 @@ func (m Model) renderSettings() string { boolToString(m.config.AutoTrimSilence), fmt.Sprintf("%.1f%%", m.config.SilenceThreshold*100), fmt.Sprintf("%.0f%%", m.config.ClippingThreshold*100), + "", // Separator value + boolToString(transcriptionConfig.Enabled), + m.getTranscriptionProviderDisplay(transcriptionConfig.DefaultProvider), + boolToString(transcriptionConfig.AutoTranscribe), } var lines []string for i, setting := range settings { + // Skip empty separator settings + if setting == "" && i == 12 { + lines = append(lines, "") + continue + } + var line string if i == m.settingsSelectedIdx { line += selectedStyle.Render("▶ ") @@ -2086,8 +2176,8 @@ func (m Model) renderSettings() string { line += " " line += successStyle.Render(values[i]) - // Add arrows for navigation - if i == m.settingsSelectedIdx { + // Add arrows for navigation (skip separator) + if i == m.settingsSelectedIdx && i != 12 { line += " " + mutedStyle.Render("← →") } @@ -2132,6 +2222,50 @@ func (m Model) getDeviceName(deviceID string) string { return fmt.Sprintf("Unknown Device (ID: %s)", deviceID) } +// Get transcription provider display name +func (m Model) getTranscriptionProviderDisplay(providerName string) string { + if providerName == "" { + return "None" + } + + available := m.transcriptionManager.GetAvailableProviders() + isAvailable := false + for _, name := range available { + if name == providerName { + isAvailable = true + break + } + } + + if isAvailable { + switch providerName { + case "whisper.cpp": + return "Whisper.cpp ✓" + case "vosk": + return "Vosk ✓" + case "openai_whisper": + return "OpenAI Whisper ✓" + case "python_script": + return "Custom Script ✓" + default: + return providerName + " ✓" + } + } else { + switch providerName { + case "whisper.cpp": + return "Whisper.cpp ✗" + case "vosk": + return "Vosk ✗" + case "openai_whisper": + return "OpenAI Whisper ✗" + case "python_script": + return "Custom Script ✗" + default: + return providerName + " ✗" + } + } +} + // Get system audio info func getSystemAudioInfo() string { // Avoid initializing PortAudio here to prevent strict init/term cycles on some platforms @@ -2943,6 +3077,95 @@ func renderPeakBar(level float32, width int) string { return bar } +// Transcribe the currently selected memo +func (m *Model) transcribeMemo() { + if len(m.memos) == 0 { + m.showNotification("No memo selected") + return + } + + memo := &m.memos[m.selectedIdx] + if memo.Transcription != nil { + m.showNotification("Memo already transcribed") + return + } + + if !m.transcriptionManager.GetConfig().Enabled { + m.showNotification("Transcription is disabled - enable in settings") + return + } + + filePath := filepath.Join(m.config.MemosPath, memo.Filename) + + // Check if file exists + if _, err := os.Stat(filePath); os.IsNotExist(err) { + m.showNotification("Audio file not found") + return + } + + m.showNotification("Transcribing...") + + // Run transcription in background (simplified for TUI) + result, err := m.transcriptionManager.Transcribe(filePath, "") + if err != nil { + m.showNotification(fmt.Sprintf("Transcription failed: %v", err)) + return + } + + // Store transcription result in memo + result.MemoID = memo.ID + memo.Transcription = result + + // Update the memo in the list + for i := range m.memos { + if m.memos[i].ID == memo.ID { + m.memos[i].Transcription = result + break + } + } + + // Save metadata + if err := saveMemos(m.memos, m.config.MemosPath); err != nil { + log.Printf("Error saving memos metadata: %v", err) + m.showNotification("Error saving transcription") + } else { + m.showNotification("Transcription completed!") + } + + // Refresh list items + m.memoList.SetItems(convertMemosToListItems(m.memos)) +} + +// Auto-transcribe a memo if enabled +func (m *Model) autoTranscribeMemo(memo *Memo) { + if !m.transcriptionManager.GetConfig().Enabled || !m.transcriptionManager.GetConfig().AutoTranscribe { + return + } + + if memo.Transcription != nil { + return // Already transcribed + } + + filePath := filepath.Join(m.config.MemosPath, memo.Filename) + + // Run transcription in background + go func() { + result, err := m.transcriptionManager.Transcribe(filePath, "") + if err != nil { + log.Printf("Auto-transcription failed: %v", err) + return + } + + result.MemoID = memo.ID + memo.Transcription = result + + // Update metadata + if err := saveMemos(m.memos, m.config.MemosPath); err != nil { + log.Printf("Error saving auto-transcription: %v", err) + } + }() +} + // Main function func main() { setupLogging() diff --git a/transcription.go b/transcription.go new file mode 100644 index 0000000..b48ae05 --- /dev/null +++ b/transcription.go @@ -0,0 +1,603 @@ +package main + +import ( + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "time" +) + +// TranscriptionProvider represents a plugin interface for transcription services +type TranscriptionProvider interface { + Name() string + IsAvailable() bool + Transcribe(audioPath string) (string, error) + Configure(config map[string]string) error +} + +// TranscriptionConfig holds transcription settings +type TranscriptionConfig struct { + Enabled bool `json:"enabled"` + DefaultProvider string `json:"default_provider"` + AutoTranscribe bool `json:"auto_transcribe"` + ProviderConfigs map[string]map[string]string `json:"provider_configs"` +} + +// TranscriptionResult holds the transcription output +type TranscriptionResult struct { + MemoID string `json:"memo_id"` + Text string `json:"text"` + Provider string `json:"provider"` + Confidence float64 `json:"confidence,omitempty"` + Language string `json:"language,omitempty"` + TranscribedAt string `json:"transcribed_at"` +} + +// ============================================================================ +// WHISPER.CPP PROVIDER (External Command) +// ============================================================================ + +type WhisperCppProvider struct { + execPath string + modelPath string + language string +} + +func NewWhisperCppProvider() *WhisperCppProvider { + return &WhisperCppProvider{ + language: "en", + } +} + +func (w *WhisperCppProvider) Name() string { + return "whisper.cpp" +} + +func (w *WhisperCppProvider) IsAvailable() bool { + // Check if whisper executable exists in PATH or configured location + if w.execPath != "" { + if _, err := os.Stat(w.execPath); err == nil { + return true + } + } + + // Check common locations + commonPaths := []string{ + "whisper", + "./whisper", + "/usr/local/bin/whisper", + "/usr/bin/whisper", + "whisper.exe", // Windows + "./whisper.exe", + } + + for _, path := range commonPaths { + if _, err := exec.LookPath(path); err == nil { + w.execPath = path + return true + } + } + + return false +} + +func (w *WhisperCppProvider) Configure(config map[string]string) error { + if path, ok := config["exec_path"]; ok { + w.execPath = path + } + if path, ok := config["model_path"]; ok { + w.modelPath = path + } + if lang, ok := config["language"]; ok { + w.language = lang + } + return nil +} + +func (w *WhisperCppProvider) Transcribe(audioPath string) (string, error) { + if !w.IsAvailable() { + return "", fmt.Errorf("whisper.cpp not found in PATH") + } + + args := []string{"-f", audioPath} + + // Add model path if configured + if w.modelPath != "" { + args = append(args, "-m", w.modelPath) + } + + // Add language + args = append(args, "-l", w.language) + + // Output to text file + args = append(args, "-otxt") + + cmd := exec.Command(w.execPath, args...) + output, err := cmd.CombinedOutput() + if err != nil { + return "", fmt.Errorf("whisper.cpp failed: %v\nOutput: %s", err, output) + } + + // Read the generated text file + txtFile := strings.TrimSuffix(audioPath, filepath.Ext(audioPath)) + ".txt" + text, err := os.ReadFile(txtFile) + if err != nil { + return "", fmt.Errorf("failed to read transcription: %v", err) + } + + // Clean up temp file + os.Remove(txtFile) + + return strings.TrimSpace(string(text)), nil +} + +// ============================================================================ +// VOSK PROVIDER (External Command) +// ============================================================================ + +type VoskProvider struct { + execPath string + modelPath string +} + +func NewVoskProvider() *VoskProvider { + return &VoskProvider{} +} + +func (v *VoskProvider) Name() string { + return "vosk" +} + +func (v *VoskProvider) IsAvailable() bool { + if v.execPath != "" { + if _, err := os.Stat(v.execPath); err == nil { + return true + } + } + + // Check for vosk-transcriber or similar + commonPaths := []string{ + "vosk-transcriber", + "vosk", + "./vosk-transcriber", + "vosk-transcriber.exe", // Windows + } + + for _, path := range commonPaths { + if _, err := exec.LookPath(path); err == nil { + v.execPath = path + return true + } + } + + return false +} + +func (v *VoskProvider) Configure(config map[string]string) error { + if path, ok := config["exec_path"]; ok { + v.execPath = path + } + if path, ok := config["model_path"]; ok { + v.modelPath = path + } + return nil +} + +func (v *VoskProvider) Transcribe(audioPath string) (string, error) { + if !v.IsAvailable() { + return "", fmt.Errorf("vosk not found") + } + + args := []string{audioPath} + if v.modelPath != "" { + args = append([]string{"-m", v.modelPath}, args...) + } + + cmd := exec.Command(v.execPath, args...) + output, err := cmd.CombinedOutput() + if err != nil { + return "", fmt.Errorf("vosk failed: %v", err) + } + + return strings.TrimSpace(string(output)), nil +} + +// ============================================================================ +// PYTHON SCRIPT PROVIDER (for users with custom scripts) +// ============================================================================ + +type PythonScriptProvider struct { + scriptPath string +} + +func NewPythonScriptProvider() *PythonScriptProvider { + return &PythonScriptProvider{} +} + +func (p *PythonScriptProvider) Name() string { + return "python_script" +} + +func (p *PythonScriptProvider) IsAvailable() bool { + if p.scriptPath == "" { + return false + } + + if _, err := os.Stat(p.scriptPath); err != nil { + return false + } + + // Check if python is available + for _, pythonCmd := range []string{"python3", "python", "py"} { + if _, err := exec.LookPath(pythonCmd); err == nil { + return true + } + } + + return false +} + +func (p *PythonScriptProvider) Configure(config map[string]string) error { + if path, ok := config["script_path"]; ok { + p.scriptPath = path + } + return nil +} + +func (p *PythonScriptProvider) Transcribe(audioPath string) (string, error) { + if !p.IsAvailable() { + return "", fmt.Errorf("python script not configured or python not found") + } + + // Try python commands in order of preference + var pythonCmd string + for _, cmd := range []string{"python3", "python", "py"} { + if _, err := exec.LookPath(cmd); err == nil { + pythonCmd = cmd + break + } + } + + if pythonCmd == "" { + return "", fmt.Errorf("no python interpreter found") + } + + cmd := exec.Command(pythonCmd, p.scriptPath, audioPath) + output, err := cmd.CombinedOutput() + if err != nil { + return "", fmt.Errorf("python script failed: %v\nOutput: %s", err, output) + } + + return strings.TrimSpace(string(output)), nil +} + +// ============================================================================ +// OPENAI WHISPER API PROVIDER (via Python) +// ============================================================================ + +type OpenAIWhisperProvider struct { + apiKey string + pythonPath string +} + +func NewOpenAIWhisperProvider() *OpenAIWhisperProvider { + return &OpenAIWhisperProvider{} +} + +func (o *OpenAIWhisperProvider) Name() string { + return "openai_whisper" +} + +func (o *OpenAIWhisperProvider) IsAvailable() bool { + // Check if API key is configured + if o.apiKey == "" { + if envKey := os.Getenv("OPENAI_API_KEY"); envKey == "" { + return false + } + } + + // Check if python is available + for _, pythonCmd := range []string{"python3", "python", "py"} { + if _, err := exec.LookPath(pythonCmd); err == nil { + return true + } + } + + return false +} + +func (o *OpenAIWhisperProvider) Configure(config map[string]string) error { + if key, ok := config["api_key"]; ok { + o.apiKey = key + } + if path, ok := config["python_path"]; ok { + o.pythonPath = path + } + return nil +} + +func (o *OpenAIWhisperProvider) Transcribe(audioPath string) (string, error) { + if !o.IsAvailable() { + return "", fmt.Errorf("OpenAI Whisper API not configured") + } + + // Create a temporary Python script for OpenAI API call + script := ` +import openai +import sys +import os + +# Set API key +api_key = os.getenv('OPENAI_API_KEY') +if not api_key: + api_key = '` + o.apiKey + `' + +client = openai.OpenAI(api_key=api_key) + +# Transcribe audio file +try: + with open(sys.argv[1], 'rb') as audio_file: + transcript = client.audio.transcriptions.create( + model="whisper-1", + file=audio_file + ) + print(transcript.text) +except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) +` + + // Write temp script + tempScript := filepath.Join(os.TempDir(), "voicelog_openai_transcribe.py") + if err := os.WriteFile(tempScript, []byte(script), 0600); err != nil { + return "", fmt.Errorf("failed to create temp script: %v", err) + } + defer os.Remove(tempScript) + + // Find python command + var pythonCmd string + for _, cmd := range []string{"python3", "python", "py"} { + if _, err := exec.LookPath(cmd); err == nil { + pythonCmd = cmd + break + } + } + + // Set API key in environment if configured + env := os.Environ() + if o.apiKey != "" { + env = append(env, "OPENAI_API_KEY="+o.apiKey) + } + + cmd := exec.Command(pythonCmd, tempScript, audioPath) + cmd.Env = env + output, err := cmd.CombinedOutput() + if err != nil { + return "", fmt.Errorf("OpenAI Whisper API failed: %v\nOutput: %s", err, output) + } + + return strings.TrimSpace(string(output)), nil +} + +// ============================================================================ +// TRANSCRIPTION MANAGER +// ============================================================================ + +type TranscriptionManager struct { + providers map[string]TranscriptionProvider + config TranscriptionConfig + configDir string +} + +func NewTranscriptionManager(configDir string) *TranscriptionManager { + tm := &TranscriptionManager{ + providers: make(map[string]TranscriptionProvider), + configDir: configDir, + config: TranscriptionConfig{ + Enabled: false, + DefaultProvider: "", + AutoTranscribe: false, + ProviderConfigs: make(map[string]map[string]string), + }, + } + + // Register available providers + tm.RegisterProvider(NewWhisperCppProvider()) + tm.RegisterProvider(NewVoskProvider()) + tm.RegisterProvider(NewPythonScriptProvider()) + tm.RegisterProvider(NewOpenAIWhisperProvider()) + + // Load config + tm.LoadConfig() + + // Configure providers from saved config + for name, provider := range tm.providers { + if providerConfig, ok := tm.config.ProviderConfigs[name]; ok { + provider.Configure(providerConfig) + } + } + + return tm +} + +func (tm *TranscriptionManager) RegisterProvider(provider TranscriptionProvider) { + tm.providers[provider.Name()] = provider +} + +func (tm *TranscriptionManager) GetAvailableProviders() []string { + var available []string + for name, provider := range tm.providers { + if provider.IsAvailable() { + available = append(available, name) + } + } + return available +} + +func (tm *TranscriptionManager) GetAllProviders() []string { + var all []string + for name := range tm.providers { + all = append(all, name) + } + return all +} + +func (tm *TranscriptionManager) IsProviderAvailable(name string) bool { + if provider, ok := tm.providers[name]; ok { + return provider.IsAvailable() + } + return false +} + +func (tm *TranscriptionManager) Transcribe(audioPath string, providerName string) (*TranscriptionResult, error) { + if !tm.config.Enabled { + return nil, fmt.Errorf("transcription is disabled") + } + + // Use default provider if none specified + if providerName == "" { + providerName = tm.config.DefaultProvider + } + + if providerName == "" { + return nil, fmt.Errorf("no default provider configured") + } + + provider, ok := tm.providers[providerName] + if !ok { + return nil, fmt.Errorf("provider not found: %s", providerName) + } + + if !provider.IsAvailable() { + return nil, fmt.Errorf("provider not available: %s", providerName) + } + + text, err := provider.Transcribe(audioPath) + if err != nil { + return nil, err + } + + result := &TranscriptionResult{ + Text: text, + Provider: providerName, + TranscribedAt: time.Now().Format(time.RFC3339), + } + + return result, nil +} + +func (tm *TranscriptionManager) LoadConfig() error { + configPath := filepath.Join(tm.configDir, "transcription.json") + + data, err := os.ReadFile(configPath) + if err != nil { + if os.IsNotExist(err) { + return nil // Use defaults + } + return err + } + + return json.Unmarshal(data, &tm.config) +} + +func (tm *TranscriptionManager) SaveConfig() error { + configPath := filepath.Join(tm.configDir, "transcription.json") + + data, err := json.MarshalIndent(tm.config, "", " ") + if err != nil { + return err + } + + return os.WriteFile(configPath, data, 0644) +} + +// ConfigureProvider updates a provider's configuration +func (tm *TranscriptionManager) ConfigureProvider(providerName string, config map[string]string) error { + provider, ok := tm.providers[providerName] + if !ok { + return fmt.Errorf("provider not found: %s", providerName) + } + + if err := provider.Configure(config); err != nil { + return err + } + + // Save to config + if tm.config.ProviderConfigs == nil { + tm.config.ProviderConfigs = make(map[string]map[string]string) + } + tm.config.ProviderConfigs[providerName] = config + return tm.SaveConfig() +} + +// SetEnabled enables or disables transcription +func (tm *TranscriptionManager) SetEnabled(enabled bool) error { + tm.config.Enabled = enabled + return tm.SaveConfig() +} + +// SetDefaultProvider sets the default transcription provider +func (tm *TranscriptionManager) SetDefaultProvider(providerName string) error { + if _, ok := tm.providers[providerName]; !ok { + return fmt.Errorf("provider not found: %s", providerName) + } + tm.config.DefaultProvider = providerName + return tm.SaveConfig() +} + +// SetAutoTranscribe enables or disables auto-transcription +func (tm *TranscriptionManager) SetAutoTranscribe(auto bool) error { + tm.config.AutoTranscribe = auto + return tm.SaveConfig() +} + +// GetConfig returns the current transcription configuration +func (tm *TranscriptionManager) GetConfig() TranscriptionConfig { + return tm.config +} + +// ShowSetupInstructions prints setup instructions for transcription providers +func ShowTranscriptionSetupInstructions() { + fmt.Println("=== VoiceLog Transcription Setup ===") + fmt.Println() + fmt.Println("VoiceLog supports optional transcription through external tools.") + fmt.Println("No installation required - configure only if you want transcription.") + fmt.Println() + fmt.Println("Supported transcription engines:") + fmt.Println() + fmt.Println("1. whisper.cpp (Recommended - Local, Private)") + fmt.Println(" - High accuracy, supports many languages") + fmt.Println(" - Installation: https://github.com/ggerganov/whisper.cpp") + fmt.Println(" - Download model: https://huggingface.co/ggerganov/whisper.cpp") + fmt.Println(" - Quick start:") + fmt.Println(" git clone https://github.com/ggerganov/whisper.cpp") + fmt.Println(" cd whisper.cpp && make") + fmt.Println(" ./models/download-ggml-model.sh base.en") + fmt.Println() + fmt.Println("2. Vosk (Lightweight, Offline)") + fmt.Println(" - Fast, good for real-time transcription") + fmt.Println(" - Installation: https://alphacephei.com/vosk/") + fmt.Println(" - Download models from: https://alphacephei.com/vosk/models") + fmt.Println() + fmt.Println("3. OpenAI Whisper API (Cloud-based)") + fmt.Println(" - Highest accuracy, requires internet & API key") + fmt.Println(" - Set OPENAI_API_KEY environment variable") + fmt.Println(" - Install: pip install openai") + fmt.Println() + fmt.Println("4. Custom Python Script") + fmt.Println(" - Use your own script with any API (AssemblyAI, Rev.ai, etc.)") + fmt.Println(" - Script should accept audio file path and output text") + fmt.Println(" - Example template available in documentation") + fmt.Println() + fmt.Println("Configuration:") + fmt.Println(" Press Ctrl+S -> Navigate to 'Transcription Settings'") + fmt.Println(" Enable transcription and select your provider") + fmt.Println() + fmt.Println("Usage:") + fmt.Println(" Press Ctrl+T to transcribe the selected memo") + fmt.Println(" Enable auto-transcribe to automatically transcribe new recordings") + fmt.Println() +}