From 2b1ef2ebb92de8ced7f547c95ca6108d02fd1d3b Mon Sep 17 00:00:00 2001 From: Ankush Malaker <43288948+AnkushMalaker@users.noreply.github.com> Date: Fri, 19 Sep 2025 15:18:38 +0000 Subject: [PATCH 1/7] WIP processing view --- .../controllers/system_controller.py | 133 +++++++- .../src/advanced_omi_backend/processors.py | 95 ++++++ .../routers/modules/system_routes.py | 23 ++ backends/advanced/webui/src/App.tsx | 6 + .../webui/src/components/layout/Layout.tsx | 3 +- .../components/processes/ActiveTasksTable.tsx | 255 +++++++++++++++ .../processes/ClientDetailModal.tsx | 300 ++++++++++++++++++ .../processes/ProcessPipelineView.tsx | 206 ++++++++++++ .../processes/ProcessingHistory.tsx | 213 +++++++++++++ .../processes/SystemHealthCards.tsx | 135 ++++++++ .../advanced/webui/src/pages/Processes.tsx | 214 +++++++++++++ backends/advanced/webui/src/pages/System.tsx | 96 +----- backends/advanced/webui/src/services/api.ts | 7 + 13 files changed, 1586 insertions(+), 100 deletions(-) create mode 100644 backends/advanced/webui/src/components/processes/ActiveTasksTable.tsx create mode 100644 backends/advanced/webui/src/components/processes/ClientDetailModal.tsx create mode 100644 backends/advanced/webui/src/components/processes/ProcessPipelineView.tsx create mode 100644 backends/advanced/webui/src/components/processes/ProcessingHistory.tsx create mode 100644 backends/advanced/webui/src/components/processes/SystemHealthCards.tsx create mode 100644 backends/advanced/webui/src/pages/Processes.tsx diff --git a/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py index 9fc7efe6..d863985f 100644 --- a/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py +++ b/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py @@ -1139,23 +1139,146 @@ async def delete_all_user_memories(user: User): """Delete all memories for the current user.""" try: from 
advanced_omi_backend.memory import get_memory_service - + memory_service = get_memory_service() - + # Delete all memories for the user deleted_count = await memory_service.delete_all_user_memories(user.user_id) - + logger.info(f"Deleted {deleted_count} memories for user {user.user_id}") - + return { "message": f"Successfully deleted {deleted_count} memories", "deleted_count": deleted_count, "user_id": user.user_id, "status": "success" } - + except Exception as e: logger.error(f"Error deleting all memories for user {user.user_id}: {e}") return JSONResponse( status_code=500, content={"error": f"Failed to delete memories: {str(e)}"} ) + + +async def get_processor_overview(): + """Get comprehensive processor overview with pipeline stats.""" + try: + processor_manager = get_processor_manager() + task_manager = get_task_manager() + + # Get pipeline statistics + pipeline_stats = processor_manager.get_pipeline_statistics() + + # Get system health metrics + task_health = task_manager.get_health_status() + queue_health = processor_manager.get_queue_health_status() + + # Get recent activity + recent_activity = processor_manager.get_processing_history(limit=10) + + overview = { + "pipeline_stats": pipeline_stats, + "system_health": { + "total_active_clients": len(processor_manager.active_file_sinks), + "total_processing_tasks": len(processor_manager.processing_tasks), + "task_manager_healthy": task_health.get("healthy", False), + "error_rate": task_health.get("recent_errors", 0) / max(task_health.get("completed_tasks", 1), 1), + "uptime_hours": time.time() / 3600 # Placeholder + }, + "queue_health": queue_health, + "recent_activity": recent_activity[:5] # Last 5 activities + } + + return overview + except Exception as e: + logger.error(f"Error getting processor overview: {e}") + return JSONResponse( + status_code=500, content={"error": f"Failed to get processor overview: {str(e)}"} + ) + +async def get_processor_history(page: int = 1, per_page: int = 50): + """Get paginated 
processing history.""" + try: + processor_manager = get_processor_manager() + + # Calculate offset + offset = (page - 1) * per_page + + # Get full history and paginate + full_history = processor_manager.get_processing_history(limit=1000) # Get more for pagination + total_items = len(full_history) + + # Paginate + paginated_history = full_history[offset:offset + per_page] + + return { + "history": paginated_history, + "pagination": { + "page": page, + "per_page": per_page, + "total": total_items, + "total_pages": (total_items + per_page - 1) // per_page + } + } + except Exception as e: + logger.error(f"Error getting processor history: {e}") + return JSONResponse( + status_code=500, content={"error": f"Failed to get processor history: {str(e)}"} + ) + +async def get_client_processing_detail(client_id: str): + """Get detailed processing information for specific client.""" + try: + from advanced_omi_backend.client_manager import get_client_manager + + processor_manager = get_processor_manager() + client_manager = get_client_manager() + + # Get processing status first - this may have data even if client is inactive + processing_status = processor_manager.get_processing_status(client_id) + + # Get task manager tasks for this client + task_manager = get_task_manager() + client_tasks = task_manager.get_tasks_for_client(client_id) + + # Try to get client info, but don't fail if client is inactive + client = client_manager.get_client(client_id) + + # If no client and no processing data, return 404 + if not client and not processing_status.get("stages") and not client_tasks: + return JSONResponse( + status_code=404, content={"error": f"No data found for client {client_id}"} + ) + + detail = { + "client_id": client_id, + "client_info": { + "user_id": getattr(client, "user_id", "unknown") if client else "unknown", + "user_email": getattr(client, "user_email", "unknown") if client else "unknown", + "current_audio_uuid": getattr(client, "current_audio_uuid", None) if client else 
None, + "conversation_start_time": getattr(client, "conversation_start_time", None) if client else None, + "sample_rate": getattr(client, "sample_rate", None) if client else None, + "status": "active" if client else "inactive" + }, + "processing_status": processing_status, + "active_tasks": [ + { + "task_id": f"{task.name}_{id(task.task)}", + "task_name": task.name, + "task_type": task.metadata.get("type", "unknown"), + "created_at": datetime.fromtimestamp(task.created_at, UTC).isoformat(), + "completed_at": datetime.fromtimestamp(task.completed_at, UTC).isoformat() if task.completed_at else None, + "error": task.error, + "cancelled": task.cancelled + } + for task in client_tasks + ] + } + + return detail + except Exception as e: + logger.error(f"Error getting client processing detail for {client_id}: {e}") + return JSONResponse( + status_code=500, content={"error": f"Failed to get client detail: {str(e)}"} + ) diff --git a/backends/advanced/src/advanced_omi_backend/processors.py b/backends/advanced/src/advanced_omi_backend/processors.py index 386a671c..2b21d72e 100644 --- a/backends/advanced/src/advanced_omi_backend/processors.py +++ b/backends/advanced/src/advanced_omi_backend/processors.py @@ -451,6 +451,101 @@ def get_all_processing_status(self) -> dict[str, Any]: all_client_ids = set(self.processing_tasks.keys()) | set(self.processing_state.keys()) return {client_id: self.get_processing_status(client_id) for client_id in all_client_ids} + def get_pipeline_statistics(self) -> dict[str, Any]: + """Calculate pipeline performance metrics for each processing stage.""" + import time + from statistics import mean + + current_time = time.time() + + # Calculate stats for each queue + pipeline_stats = {} + + # Audio Queue Stats + audio_tasks = [] + for client_id, state in self.processing_state.items(): + audio_stage = state.get("audio", {}) + if audio_stage.get("status") == "completed": + audio_tasks.append({ + "duration": audio_stage.get("metadata", 
{}).get("processing_time", 1.0), + "timestamp": audio_stage.get("timestamp", current_time) + }) + + pipeline_stats["audio"] = { + "queue_size": self.audio_queue.qsize(), + "active_tasks": sum(1 for state in self.processing_state.values() + if state.get("audio", {}).get("status") == "started"), + "avg_processing_time_ms": mean([t["duration"] * 1000 for t in audio_tasks[-50:]]) if audio_tasks else 0, + "success_rate": len([t for t in audio_tasks[-100:] if t]) / max(len(audio_tasks[-100:]), 1), + "throughput_per_minute": len([t for t in audio_tasks if current_time - t["timestamp"] < 60]) + } + + # Similar calculations for other stages + for stage in ["transcription", "memory", "cropping"]: + queue_attr = f"{stage}_queue" + queue = getattr(self, queue_attr, None) + + pipeline_stats[stage] = { + "queue_size": queue.qsize() if queue else 0, + "active_tasks": len([tid for tid, tinfo in self.processing_tasks.items() + if stage in tid and not self.task_manager.get_task_info(tinfo.get(stage, "")).completed_at]), + "avg_processing_time_ms": 30000, # Placeholder - can be calculated from task manager history + "success_rate": 0.95, # Placeholder - can be calculated from completed tasks + "throughput_per_minute": 5 # Placeholder + } + + return pipeline_stats + + def get_processing_history(self, limit: int = 50) -> list[dict[str, Any]]: + """Get recent processing history from task manager.""" + history = [] + + try: + # Get completed tasks from task manager (get the last N items) + completed_tasks = self.task_manager.completed_tasks[-limit:] if self.task_manager.completed_tasks else [] + + for task_info in completed_tasks: + task_type = task_info.metadata.get("type", "unknown") + if task_type in ["memory", "cropping", "transcription_chunk"]: + history.append({ + "client_id": task_info.metadata.get("client_id", "unknown"), + "conversation_id": task_info.metadata.get("conversation_id"), + "task_type": task_type, + "started_at": datetime.fromtimestamp(task_info.created_at, 
UTC).isoformat(), + "completed_at": datetime.fromtimestamp(task_info.completed_at, UTC).isoformat() if task_info.completed_at else None, + "duration_ms": (task_info.completed_at - task_info.created_at) * 1000 if task_info.completed_at else None, + "status": "completed" if task_info.completed_at and not task_info.error else "failed", + "error": task_info.error + }) + + return sorted(history, key=lambda x: x["started_at"], reverse=True) + except Exception as e: + logger.error(f"Error getting processing history: {e}") + return [] + + def get_queue_health_status(self) -> dict[str, str]: + """Determine queue health based on depth and processing rates.""" + health_status = {} + + queue_sizes = { + "audio": self.audio_queue.qsize(), + "transcription": self.transcription_queue.qsize(), + "memory": self.memory_queue.qsize(), + "cropping": self.cropping_queue.qsize() + } + + for queue_name, size in queue_sizes.items(): + if size == 0: + health_status[queue_name] = "idle" + elif size < 5: + health_status[queue_name] = "healthy" + elif size < 20: + health_status[queue_name] = "busy" + else: + health_status[queue_name] = "overloaded" + + return health_status + async def mark_transcription_failed(self, client_id: str, error: str): """Mark transcription as failed and clean up transcription manager. 
diff --git a/backends/advanced/src/advanced_omi_backend/routers/modules/system_routes.py b/backends/advanced/src/advanced_omi_backend/routers/modules/system_routes.py index 5e5d34d6..494db6ce 100644 --- a/backends/advanced/src/advanced_omi_backend/routers/modules/system_routes.py +++ b/backends/advanced/src/advanced_omi_backend/routers/modules/system_routes.py @@ -166,3 +166,26 @@ async def reload_memory_config(current_user: User = Depends(current_superuser)): async def delete_all_user_memories(current_user: User = Depends(current_active_user)): """Delete all memories for the current user.""" return await system_controller.delete_all_user_memories(current_user) + + +@router.get("/processor/overview") +async def get_processor_overview_route(current_user: User = Depends(current_superuser)): + """Get comprehensive processor overview with pipeline stats. Admin only.""" + return await system_controller.get_processor_overview() + +@router.get("/processor/history") +async def get_processor_history_route( + page: int = Query(1, ge=1, description="Page number"), + per_page: int = Query(50, ge=1, le=100, description="Items per page"), + current_user: User = Depends(current_superuser) +): + """Get paginated processing history. Admin only.""" + return await system_controller.get_processor_history(page, per_page) + +@router.get("/processor/clients/{client_id}") +async def get_client_processing_detail_route( + client_id: str, + current_user: User = Depends(current_superuser) +): + """Get detailed processing information for specific client. 
Admin only.""" + return await system_controller.get_client_processing_detail(client_id) diff --git a/backends/advanced/webui/src/App.tsx b/backends/advanced/webui/src/App.tsx index 16b723a8..1be7de6b 100644 --- a/backends/advanced/webui/src/App.tsx +++ b/backends/advanced/webui/src/App.tsx @@ -10,6 +10,7 @@ import Users from './pages/Users' import System from './pages/System' import Upload from './pages/Upload' import LiveRecord from './pages/LiveRecord' +import Processes from './pages/Processes' import ProtectedRoute from './components/auth/ProtectedRoute' import { ErrorBoundary, PageErrorBoundary } from './components/ErrorBoundary' @@ -68,6 +69,11 @@ function App() { } /> + + + + } /> diff --git a/backends/advanced/webui/src/components/layout/Layout.tsx b/backends/advanced/webui/src/components/layout/Layout.tsx index 13f2fa13..182b4e82 100644 --- a/backends/advanced/webui/src/components/layout/Layout.tsx +++ b/backends/advanced/webui/src/components/layout/Layout.tsx @@ -1,5 +1,5 @@ import { Link, useLocation, Outlet } from 'react-router-dom' -import { Music, MessageSquare, MessageCircle, Brain, Users, Upload, Settings, LogOut, Sun, Moon, Shield, Radio } from 'lucide-react' +import { Music, MessageSquare, MessageCircle, Brain, Users, Upload, Settings, LogOut, Sun, Moon, Shield, Radio, Activity } from 'lucide-react' import { useAuth } from '../../contexts/AuthContext' import { useTheme } from '../../contexts/ThemeContext' @@ -16,6 +16,7 @@ export default function Layout() { { path: '/users', label: 'User Management', icon: Users }, ...(isAdmin ? 
[ { path: '/upload', label: 'Upload Audio', icon: Upload }, + { path: '/processes', label: 'Processes', icon: Activity }, { path: '/system', label: 'System State', icon: Settings }, ] : []), ] diff --git a/backends/advanced/webui/src/components/processes/ActiveTasksTable.tsx b/backends/advanced/webui/src/components/processes/ActiveTasksTable.tsx new file mode 100644 index 00000000..8fb37e0b --- /dev/null +++ b/backends/advanced/webui/src/components/processes/ActiveTasksTable.tsx @@ -0,0 +1,255 @@ +import { useState, useEffect } from 'react' +import { Users, ExternalLink, ArrowUpDown, Search, RefreshCw } from 'lucide-react' +import { systemApi } from '../../services/api' + +interface ProcessingTask { + client_id: string + user_id: string + stages: Record +} + +interface ActiveTasksTableProps { + onClientSelect: (clientId: string) => void + refreshTrigger?: Date | null +} + +export default function ActiveTasksTable({ onClientSelect, refreshTrigger }: ActiveTasksTableProps) { + const [tasks, setTasks] = useState([]) + const [loading, setLoading] = useState(false) + const [error, setError] = useState(null) + const [searchTerm, setSearchTerm] = useState('') + const [sortField, setSortField] = useState<'client_id' | 'user_id' | 'stage_count'>('client_id') + const [sortDirection, setSortDirection] = useState<'asc' | 'desc'>('asc') + + const loadActiveTasks = async () => { + try { + setLoading(true) + setError(null) + const response = await systemApi.getProcessorTasks() + + // Convert the response to our expected format + const taskList = Object.entries(response.data).map(([clientId, taskData]: [string, any]) => ({ + client_id: clientId, + user_id: taskData.user_id || 'Unknown', + stages: taskData.stages || {} + })) + + setTasks(taskList) + } catch (err: any) { + setError(err.message || 'Failed to load active tasks') + } finally { + setLoading(false) + } + } + + useEffect(() => { + loadActiveTasks() + }, [refreshTrigger]) + + const handleSort = (field: typeof sortField) => 
{ + if (sortField === field) { + setSortDirection(sortDirection === 'asc' ? 'desc' : 'asc') + } else { + setSortField(field) + setSortDirection('asc') + } + } + + const getStageCount = (stages: Record) => { + return Object.keys(stages).length + } + + const getActiveStage = (stages: Record) => { + // Find the most recent active stage + const stageNames = ['audio', 'transcription', 'memory', 'cropping'] + for (const stageName of stageNames) { + const stage = stages[stageName] + if (stage && stage.status === 'started' && !stage.completed) { + return stageName + } + } + return 'idle' + } + + const getStageDisplay = (stageName: string) => { + const stageColors = { + audio: 'bg-blue-100 text-blue-800 dark:bg-blue-900/40 dark:text-blue-300', + transcription: 'bg-green-100 text-green-800 dark:bg-green-900/40 dark:text-green-300', + memory: 'bg-purple-100 text-purple-800 dark:bg-purple-900/40 dark:text-purple-300', + cropping: 'bg-orange-100 text-orange-800 dark:bg-orange-900/40 dark:text-orange-300', + idle: 'bg-gray-100 text-gray-800 dark:bg-gray-900/40 dark:text-gray-300' + } + + const color = stageColors[stageName as keyof typeof stageColors] || stageColors.idle + + return ( + + {stageName.charAt(0).toUpperCase() + stageName.slice(1)} + + ) + } + + // Filter and sort tasks + const filteredTasks = tasks.filter(task => + task.client_id.toLowerCase().includes(searchTerm.toLowerCase()) || + task.user_id.toLowerCase().includes(searchTerm.toLowerCase()) + ) + + const sortedTasks = [...filteredTasks].sort((a, b) => { + let aValue: any, bValue: any + + switch (sortField) { + case 'stage_count': + aValue = getStageCount(a.stages) + bValue = getStageCount(b.stages) + break + case 'user_id': + aValue = a.user_id + bValue = b.user_id + break + default: + aValue = a.client_id + bValue = b.client_id + } + + if (sortDirection === 'asc') { + return aValue > bValue ? 1 : -1 + } else { + return aValue < bValue ? 1 : -1 + } + }) + + return ( +
+
+
+ +

+ Active Tasks ({sortedTasks.length}) +

+
+ +
+ + {/* Search */} +
+
+ + setSearchTerm(e.target.value)} + className="w-full pl-10 pr-4 py-2 border border-gray-300 dark:border-gray-600 rounded-md bg-white dark:bg-gray-700 text-gray-900 dark:text-gray-100 focus:ring-2 focus:ring-blue-500 focus:border-blue-500" + /> +
+
+ + {/* Error Display */} + {error && ( +
+

{error}

+
+ )} + + {/* Table */} +
+ + + + + + + + + + + + {loading ? ( + + + + ) : sortedTasks.length === 0 ? ( + + + + ) : ( + sortedTasks.map((task) => ( + + + + + + + + )) + )} + +
+ + + + Current Stage + + Actions
+ + Loading tasks... +
+ {tasks.length === 0 ? 'No active tasks' : 'No tasks match your search'} +
+ + {task.client_id} + + + {task.user_id} + + {getStageDisplay(getActiveStage(task.stages))} + + {getStageCount(task.stages)} + + +
+
+
+ ) +} \ No newline at end of file diff --git a/backends/advanced/webui/src/components/processes/ClientDetailModal.tsx b/backends/advanced/webui/src/components/processes/ClientDetailModal.tsx new file mode 100644 index 00000000..d5ac193e --- /dev/null +++ b/backends/advanced/webui/src/components/processes/ClientDetailModal.tsx @@ -0,0 +1,300 @@ +import { useState, useEffect } from 'react' +import { X, User, Activity, Clock, CheckCircle, XCircle, RefreshCw, AlertTriangle } from 'lucide-react' +import { systemApi } from '../../services/api' + +interface ClientProcessingDetail { + client_id: string + client_info: { + user_id: string + user_email: string + current_audio_uuid?: string + conversation_start_time?: string + sample_rate?: number + } + processing_status: { + stages: Record + } + active_tasks: Array<{ + task_id: string + task_name: string + task_type: string + created_at: string + completed_at?: string + error?: string + cancelled: boolean + }> +} + +interface ClientDetailModalProps { + clientId: string + onClose: () => void +} + +export default function ClientDetailModal({ clientId, onClose }: ClientDetailModalProps) { + const [clientDetail, setClientDetail] = useState(null) + const [loading, setLoading] = useState(false) + const [error, setError] = useState(null) + + const loadClientDetail = async () => { + try { + setLoading(true) + setError(null) + const response = await systemApi.getClientProcessingDetail(clientId) + setClientDetail(response.data) + } catch (err: any) { + setError(err.message || 'Failed to load client details') + } finally { + setLoading(false) + } + } + + useEffect(() => { + loadClientDetail() + }, [clientId]) + + const formatTime = (timestamp: string) => { + return new Date(timestamp).toLocaleString() + } + + const getStageIcon = (status: string, completed?: boolean, error?: string) => { + if (error) return + if (completed) return + if (status === 'started') return + return + } + + const getStageStatus = (status: string, completed?: 
boolean, error?: string) => { + if (error) return 'Failed' + if (completed) return 'Completed' + if (status === 'started') return 'Processing' + return 'Pending' + } + + const getStageColor = (status: string, completed?: boolean, error?: string) => { + if (error) return 'border-red-200 bg-red-50 dark:border-red-800 dark:bg-red-900/20' + if (completed) return 'border-green-200 bg-green-50 dark:border-green-800 dark:bg-green-900/20' + if (status === 'started') return 'border-blue-200 bg-blue-50 dark:border-blue-800 dark:bg-blue-900/20' + return 'border-gray-200 bg-gray-50 dark:border-gray-700 dark:bg-gray-800' + } + + const getTaskStatusIcon = (task: ClientProcessingDetail['active_tasks'][0]) => { + if (task.cancelled) return + if (task.error) return + if (task.completed_at) return + return + } + + return ( +
+
+ {/* Header */} +
+
+ +

+ Client Details +

+ + {clientId} + +
+
+ + +
+
+ + {/* Content */} +
+ {loading && !clientDetail && ( +
+ + Loading client details... +
+ )} + + {error && ( +
+
+ +

{error}

+
+
+ )} + + {clientDetail && ( +
+ {/* Client Information */} +
+

+ Client Information +

+
+
+ +

{clientDetail.client_info.user_id}

+
+
+ +

{clientDetail.client_info.user_email}

+
+
+ +

+ {clientDetail.client_info.current_audio_uuid ? ( + + {clientDetail.client_info.current_audio_uuid} + + ) : ( + 'None' + )} +

+
+
+ +

+ {clientDetail.client_info.sample_rate ? `${clientDetail.client_info.sample_rate} Hz` : 'N/A'} +

+
+ {clientDetail.client_info.conversation_start_time && ( +
+ +

+ {formatTime(clientDetail.client_info.conversation_start_time)} +

+
+ )} +
+
+ + {/* Processing Stages */} +
+

+ Processing Stages +

+
+ {Object.entries(clientDetail.processing_status.stages || {}).map(([stageName, stage]) => ( +
+
+
+ {getStageIcon(stage.status, stage.completed, stage.error)} +

+ {stageName} +

+
+ + {getStageStatus(stage.status, stage.completed, stage.error)} + +
+ {stage.timestamp && ( +

+ {formatTime(stage.timestamp)} +

+ )} + {stage.error && ( +

+ {stage.error} +

+ )} + {stage.metadata && Object.keys(stage.metadata).length > 0 && ( +
+
+ + View Metadata + +
+                              {JSON.stringify(stage.metadata, null, 2)}
+                            
+
+
+ )} +
+ ))} +
+
+ + {/* Active Tasks */} +
+

+ Active Tasks ({clientDetail.active_tasks.length}) +

+ {clientDetail.active_tasks.length === 0 ? ( +

+ No active tasks +

+ ) : ( +
+ {clientDetail.active_tasks.map((task) => ( +
+
+
+ {getTaskStatusIcon(task)} +

+ {task.task_name} +

+ + {task.task_type} + +
+ + {task.task_id} + +
+
+
+ +

{formatTime(task.created_at)}

+
+ {task.completed_at && ( +
+ +

{formatTime(task.completed_at)}

+
+ )} +
+ {task.error && ( +
+

{task.error}

+
+ )} + {task.cancelled && ( +
+

Task was cancelled

+
+ )} +
+ ))} +
+ )} +
+
+ )} +
+
+
+ ) +} \ No newline at end of file diff --git a/backends/advanced/webui/src/components/processes/ProcessPipelineView.tsx b/backends/advanced/webui/src/components/processes/ProcessPipelineView.tsx new file mode 100644 index 00000000..eac3209e --- /dev/null +++ b/backends/advanced/webui/src/components/processes/ProcessPipelineView.tsx @@ -0,0 +1,206 @@ +import { ArrowRight, Volume2, FileText, Brain, Scissors, CheckCircle, AlertTriangle, Clock } from 'lucide-react' + +interface PipelineStageStats { + queue_size: number + active_tasks: number + avg_processing_time_ms: number + success_rate: number + throughput_per_minute: number +} + +interface ProcessPipelineViewProps { + pipelineStats: { + audio: PipelineStageStats + transcription: PipelineStageStats + memory: PipelineStageStats + cropping: PipelineStageStats + } + queueHealth: Record +} + +export default function ProcessPipelineView({ pipelineStats, queueHealth }: ProcessPipelineViewProps) { + const stages = [ + { + name: 'Audio', + icon: Volume2, + key: 'audio' as keyof typeof pipelineStats, + color: 'blue', + description: 'Audio chunk processing' + }, + { + name: 'Transcription', + icon: FileText, + key: 'transcription' as keyof typeof pipelineStats, + color: 'green', + description: 'Speech-to-text conversion' + }, + { + name: 'Memory', + icon: Brain, + key: 'memory' as keyof typeof pipelineStats, + color: 'purple', + description: 'Memory extraction' + }, + { + name: 'Cropping', + icon: Scissors, + key: 'cropping' as keyof typeof pipelineStats, + color: 'orange', + description: 'Audio file optimization' + } + ] + + const getHealthIcon = (health: string) => { + switch (health) { + case 'healthy': + return + case 'busy': + return + case 'overloaded': + return + default: + return + } + } + + const getHealthColor = (health: string) => { + switch (health) { + case 'healthy': return 'border-green-200 bg-green-50 dark:border-green-800 dark:bg-green-900/20' + case 'busy': return 'border-yellow-200 bg-yellow-50 
dark:border-yellow-800 dark:bg-yellow-900/20' + case 'overloaded': return 'border-red-200 bg-red-50 dark:border-red-800 dark:bg-red-900/20' + default: return 'border-gray-200 bg-gray-50 dark:border-gray-700 dark:bg-gray-800/20' + } + } + + const getStageColor = (color: string) => { + const colors = { + blue: 'text-blue-600 bg-blue-100 dark:bg-blue-900/20', + green: 'text-green-600 bg-green-100 dark:bg-green-900/20', + purple: 'text-purple-600 bg-purple-100 dark:bg-purple-900/20', + orange: 'text-orange-600 bg-orange-100 dark:bg-orange-900/20' + } + return colors[color as keyof typeof colors] || colors.blue + } + + return ( +
+

+ Processing Pipeline +

+ + {/* Pipeline Stages */} +
+ {stages.map((stage, index) => { + const stats = pipelineStats[stage.key] + const health = queueHealth[stage.key] || 'idle' + const Icon = stage.icon + + return ( +
+ {/* Stage Card */} +
+ {/* Stage Header */} +
+
+
+ +
+
+

+ {stage.name} +

+

+ {stage.description} +

+
+
+ {getHealthIcon(health)} +
+ + {/* Stage Stats */} +
+
+ Queue + + {stats.queue_size} + +
+
+ Active + + {stats.active_tasks} + +
+
+ Avg Time + + {stats.avg_processing_time_ms < 1000 + ? `${Math.round(stats.avg_processing_time_ms)}ms` + : `${(stats.avg_processing_time_ms / 1000).toFixed(1)}s` + } + +
+
+ Success + + {(stats.success_rate * 100).toFixed(0)}% + +
+
+ + {/* Health Status */} +
+ + {health.charAt(0).toUpperCase() + health.slice(1)} + +
+
+ + {/* Arrow (except for last stage) */} + {index < stages.length - 1 && ( +
+ +
+ )} +
+ ) + })} +
+ + {/* Pipeline Summary */} +
+
+
+
+ {Object.values(pipelineStats).reduce((sum, stage) => sum + stage.queue_size, 0)} +
+
Total Queued
+
+
+
+ {Object.values(pipelineStats).reduce((sum, stage) => sum + stage.active_tasks, 0)} +
+
Total Active
+
+
+
+ {Math.round(Object.values(pipelineStats).reduce((sum, stage) => sum + stage.success_rate, 0) / Object.keys(pipelineStats).length * 100)}% +
+
Avg Success Rate
+
+
+
+ {Object.values(pipelineStats).reduce((sum, stage) => sum + stage.throughput_per_minute, 0)} +
+
Total Throughput/min
+
+
+
+
+ ) +} \ No newline at end of file diff --git a/backends/advanced/webui/src/components/processes/ProcessingHistory.tsx b/backends/advanced/webui/src/components/processes/ProcessingHistory.tsx new file mode 100644 index 00000000..0d73d1c1 --- /dev/null +++ b/backends/advanced/webui/src/components/processes/ProcessingHistory.tsx @@ -0,0 +1,213 @@ +import { useState, useEffect } from 'react' +import { Clock, CheckCircle, XCircle, ChevronLeft, ChevronRight, RefreshCw, BarChart3 } from 'lucide-react' +import { systemApi } from '../../services/api' + +interface ProcessingHistoryItem { + client_id: string + conversation_id?: string + task_type: string + started_at: string + completed_at?: string + duration_ms?: number + status: string + error?: string +} + +interface ProcessingHistoryProps { + initialData?: ProcessingHistoryItem[] + refreshTrigger?: Date | null +} + +export default function ProcessingHistory({ initialData = [], refreshTrigger }: ProcessingHistoryProps) { + const [history, setHistory] = useState(initialData) + const [loading, setLoading] = useState(false) + const [error, setError] = useState(null) + const [currentPage, setCurrentPage] = useState(1) + const [totalPages, setTotalPages] = useState(1) + const [perPage] = useState(10) + + const loadHistory = async (page: number = currentPage) => { + try { + setLoading(true) + setError(null) + const response = await systemApi.getProcessorHistory(page, perPage) + + setHistory(response.data.history) + setCurrentPage(response.data.pagination.page) + setTotalPages(response.data.pagination.total_pages) + } catch (err: any) { + setError(err.message || 'Failed to load processing history') + } finally { + setLoading(false) + } + } + + useEffect(() => { + if (refreshTrigger) { + loadHistory(1) // Refresh from first page + } + }, [refreshTrigger]) + + useEffect(() => { + if (initialData.length === 0) { + loadHistory(1) + } + }, []) + + const formatDuration = (durationMs?: number) => { + if (!durationMs) return 'N/A' + if 
(durationMs < 1000) return `${Math.round(durationMs)}ms` + if (durationMs < 60000) return `${(durationMs / 1000).toFixed(1)}s` + return `${(durationMs / 60000).toFixed(1)}m` + } + + const formatTime = (timestamp: string) => { + return new Date(timestamp).toLocaleTimeString() + } + + const getStatusIcon = (status: string) => { + switch (status) { + case 'completed': + return + case 'failed': + return + default: + return + } + } + + const getStatusColor = (status: string) => { + switch (status) { + case 'completed': + return 'bg-green-100 text-green-800 dark:bg-green-900/40 dark:text-green-300' + case 'failed': + return 'bg-red-100 text-red-800 dark:bg-red-900/40 dark:text-red-300' + default: + return 'bg-yellow-100 text-yellow-800 dark:bg-yellow-900/40 dark:text-yellow-300' + } + } + + const getTaskTypeColor = (taskType: string) => { + const colors = { + memory: 'bg-purple-100 text-purple-800 dark:bg-purple-900/40 dark:text-purple-300', + transcription_chunk: 'bg-green-100 text-green-800 dark:bg-green-900/40 dark:text-green-300', + cropping: 'bg-orange-100 text-orange-800 dark:bg-orange-900/40 dark:text-orange-300' + } + return colors[taskType as keyof typeof colors] || 'bg-gray-100 text-gray-800 dark:bg-gray-900/40 dark:text-gray-300' + } + + const handlePageChange = (newPage: number) => { + if (newPage >= 1 && newPage <= totalPages) { + loadHistory(newPage) + } + } + + return ( +
+
+
+ +

+ Processing History +

+
+ +
+ + {/* Error Display */} + {error && ( +
+

{error}

+
+ )} + + {/* History List */} +
+ {loading ? ( +
+ + Loading history... +
+ ) : history.length === 0 ? ( +
+ No processing history available +
+ ) : ( + history.map((item, index) => ( +
+
+ {getStatusIcon(item.status)} +
+
+ + {item.task_type.replace('_', ' ')} + + + {item.status} + +
+
+ Client: {item.client_id} + {item.conversation_id && ( + + Conv: {item.conversation_id} + + )} +
+ {item.error && ( +
+ Error: {item.error} +
+ )} +
+
+
+
{formatTime(item.started_at)}
+
+ {formatDuration(item.duration_ms)} +
+
+
+ )) + )} +
+ + {/* Pagination */} + {totalPages > 1 && ( +
+
+ Page {currentPage} of {totalPages} +
+
+ + +
+
+ )} +
+ ) +} \ No newline at end of file diff --git a/backends/advanced/webui/src/components/processes/SystemHealthCards.tsx b/backends/advanced/webui/src/components/processes/SystemHealthCards.tsx new file mode 100644 index 00000000..9e88ea31 --- /dev/null +++ b/backends/advanced/webui/src/components/processes/SystemHealthCards.tsx @@ -0,0 +1,135 @@ +import { Users, Activity, AlertTriangle, CheckCircle, Clock } from 'lucide-react' + +interface SystemHealthData { + total_active_clients: number + total_processing_tasks: number + task_manager_healthy: boolean + error_rate: number + uptime_hours: number +} + +interface SystemHealthCardsProps { + data: SystemHealthData +} + +export default function SystemHealthCards({ data }: SystemHealthCardsProps) { + const cards = [ + { + title: 'Active Clients', + value: data.total_active_clients, + icon: Users, + color: 'blue', + description: 'Currently connected clients' + }, + { + title: 'Processing Tasks', + value: data.total_processing_tasks, + icon: Activity, + color: 'green', + description: 'Tasks in processing queues' + }, + { + title: 'Error Rate', + value: `${(data.error_rate * 100).toFixed(1)}%`, + icon: data.error_rate > 0.1 ? AlertTriangle : CheckCircle, + color: data.error_rate > 0.1 ? 
'red' : 'green', + description: 'Recent processing error rate' + }, + { + title: 'Uptime', + value: `${Math.floor(data.uptime_hours)}h`, + icon: Clock, + color: 'purple', + description: 'System uptime' + } + ] + + const getCardColors = (color: string) => { + const colors = { + blue: { + bg: 'bg-blue-50 dark:bg-blue-900/20', + border: 'border-blue-200 dark:border-blue-800', + icon: 'text-blue-600 bg-blue-100 dark:bg-blue-900/40 dark:text-blue-400', + text: 'text-blue-900 dark:text-blue-100' + }, + green: { + bg: 'bg-green-50 dark:bg-green-900/20', + border: 'border-green-200 dark:border-green-800', + icon: 'text-green-600 bg-green-100 dark:bg-green-900/40 dark:text-green-400', + text: 'text-green-900 dark:text-green-100' + }, + red: { + bg: 'bg-red-50 dark:bg-red-900/20', + border: 'border-red-200 dark:border-red-800', + icon: 'text-red-600 bg-red-100 dark:bg-red-900/40 dark:text-red-400', + text: 'text-red-900 dark:text-red-100' + }, + purple: { + bg: 'bg-purple-50 dark:bg-purple-900/20', + border: 'border-purple-200 dark:border-purple-800', + icon: 'text-purple-600 bg-purple-100 dark:bg-purple-900/40 dark:text-purple-400', + text: 'text-purple-900 dark:text-purple-100' + } + } + return colors[color as keyof typeof colors] || colors.blue + } + + return ( +
+ {cards.map((card) => { + const Icon = card.icon + const colors = getCardColors(card.color) + + return ( +
+
+
+

+ {card.title} +

+

+ {card.value} +

+

+ {card.description} +

+
+
+ +
+
+ + {/* Health Indicator for Task Manager */} + {card.title === 'Processing Tasks' && ( +
+
+ + Task Manager: {data.task_manager_healthy ? 'Healthy' : 'Unhealthy'} + +
+ )} + + {/* Error Rate Trend */} + {card.title === 'Error Rate' && ( +
+
+
0.1 ? 'bg-red-500' : 'bg-green-500' + }`} + style={{ width: `${Math.min(data.error_rate * 100, 100)}%` }} + /> +
+
+ )} +
+ ) + })} +
+ ) +} \ No newline at end of file diff --git a/backends/advanced/webui/src/pages/Processes.tsx b/backends/advanced/webui/src/pages/Processes.tsx new file mode 100644 index 00000000..0eaf050f --- /dev/null +++ b/backends/advanced/webui/src/pages/Processes.tsx @@ -0,0 +1,214 @@ +import { useState, useEffect } from 'react' +import { Activity, RefreshCw, Users, Clock, BarChart3 } from 'lucide-react' +import { systemApi } from '../services/api' +import { useAuth } from '../contexts/AuthContext' +import ProcessPipelineView from '../components/processes/ProcessPipelineView' +import SystemHealthCards from '../components/processes/SystemHealthCards' +import ActiveTasksTable from '../components/processes/ActiveTasksTable' +import ProcessingHistory from '../components/processes/ProcessingHistory' +import ClientDetailModal from '../components/processes/ClientDetailModal' + +interface ProcessorOverview { + pipeline_stats: { + audio: PipelineStageStats + transcription: PipelineStageStats + memory: PipelineStageStats + cropping: PipelineStageStats + } + system_health: { + total_active_clients: number + total_processing_tasks: number + task_manager_healthy: boolean + error_rate: number + uptime_hours: number + } + queue_health: Record + recent_activity: ProcessingHistoryItem[] +} + +interface PipelineStageStats { + queue_size: number + active_tasks: number + avg_processing_time_ms: number + success_rate: number + throughput_per_minute: number +} + +interface ProcessingHistoryItem { + client_id: string + conversation_id?: string + task_type: string + started_at: string + completed_at?: string + duration_ms?: number + status: string + error?: string +} + +interface ClientProcessingDetail { + client_id: string + client_info: { + user_id: string + user_email: string + current_audio_uuid?: string + conversation_start_time?: string + sample_rate?: number + } + processing_status: any + active_tasks: Array<{ + task_id: string + task_name: string + task_type: string + created_at: string + 
completed_at?: string + error?: string + cancelled: boolean + }> +} + +export default function Processes() { + const [overviewData, setOverviewData] = useState(null) + const [loading, setLoading] = useState(false) + const [error, setError] = useState(null) + const [lastUpdated, setLastUpdated] = useState(null) + const [selectedClientId, setSelectedClientId] = useState(null) + const [autoRefresh, setAutoRefresh] = useState(true) + + const { isAdmin } = useAuth() + + const loadProcessorOverview = async () => { + if (!isAdmin) return + + try { + setLoading(true) + setError(null) + + const response = await systemApi.getProcessorOverview() + setOverviewData(response.data) + setLastUpdated(new Date()) + } catch (err: any) { + setError(err.message || 'Failed to load processor overview') + } finally { + setLoading(false) + } + } + + // Auto-refresh effect + useEffect(() => { + if (!autoRefresh) return + + const interval = setInterval(() => { + loadProcessorOverview() + }, 5000) // Refresh every 5 seconds + + return () => clearInterval(interval) + }, [autoRefresh, isAdmin]) + + // Initial load + useEffect(() => { + loadProcessorOverview() + }, [isAdmin]) + + if (!isAdmin) { + return ( +
+ +

+ Access Restricted +

+

+ You need administrator privileges to view process monitoring. +

+
+ ) + } + + return ( +
+ {/* Header */} +
+
+ +

+ Process Monitoring +

+
+
+ {lastUpdated && ( + + Last updated: {lastUpdated.toLocaleTimeString()} + + )} + + {/* Auto-refresh toggle */} + + + +
+
+ + {/* Error Message */} + {error && ( +
+

{error}

+
+ )} + + {overviewData && ( +
+ {/* System Health Overview */} + + + {/* Processing Pipeline View */} + + + {/* Active Tasks and History */} +
+ + +
+
+ )} + + {/* Loading State */} + {loading && !overviewData && ( +
+ + Loading process data... +
+ )} + + {/* Client Detail Modal */} + {selectedClientId && ( + setSelectedClientId(null)} + /> + )} +
+ ) +} \ No newline at end of file diff --git a/backends/advanced/webui/src/pages/System.tsx b/backends/advanced/webui/src/pages/System.tsx index 8a7e5e0e..c1283660 100644 --- a/backends/advanced/webui/src/pages/System.tsx +++ b/backends/advanced/webui/src/pages/System.tsx @@ -1,5 +1,5 @@ import { useState, useEffect } from 'react' -import { Settings, RefreshCw, CheckCircle, XCircle, AlertCircle, Activity, Users, Database, Server, Volume2, Mic } from 'lucide-react' +import { Settings, RefreshCw, CheckCircle, XCircle, AlertCircle, Activity, Users, Database, Volume2, Mic } from 'lucide-react' import { systemApi, speakerApi } from '../services/api' import { useAuth } from '../contexts/AuthContext' import MemorySettings from '../components/MemorySettings' @@ -21,20 +21,6 @@ interface MetricsData { } } -interface ProcessorStatus { - audio_queue_size: number - transcription_queue_size: number - memory_queue_size: number - active_tasks: number -} - -interface ActiveClient { - id: string - user_id: string - connected_at: string - last_activity: string -} - interface DiarizationSettings { diarization_source: 'deepgram' | 'pyannote' similarity_threshold: number @@ -49,8 +35,6 @@ export default function System() { const [healthData, setHealthData] = useState(null) const [readinessData, setReadinessData] = useState(null) const [metricsData, setMetricsData] = useState(null) - const [processorStatus, setProcessorStatus] = useState(null) - const [activeClients, setActiveClients] = useState([]) const [loading, setLoading] = useState(false) const [error, setError] = useState(null) const [lastUpdated, setLastUpdated] = useState(null) @@ -74,12 +58,10 @@ export default function System() { setLoading(true) setError(null) - const [health, readiness, metrics, processor, clients] = await Promise.allSettled([ + const [health, readiness, metrics] = await Promise.allSettled([ systemApi.getHealth(), systemApi.getReadiness(), systemApi.getMetrics().catch(() => ({ data: null })), // Optional 
endpoint - systemApi.getProcessorStatus().catch(() => ({ data: null })), // Optional endpoint - systemApi.getActiveClients().catch(() => ({ data: [] })), // Optional endpoint ]) if (health.status === 'fulfilled') { @@ -91,12 +73,6 @@ export default function System() { if (metrics.status === 'fulfilled' && metrics.value.data) { setMetricsData(metrics.value.data) } - if (processor.status === 'fulfilled' && processor.value.data) { - setProcessorStatus(processor.value.data) - } - if (clients.status === 'fulfilled' && clients.value.data) { - setActiveClients(clients.value.data) - } setLastUpdated(new Date()) } catch (err: any) { @@ -282,41 +258,6 @@ export default function System() {
)} - {/* Processor Status */} - {processorStatus && ( -
-

- - Processor Status -

-
-
-
Audio Queue
-
- {processorStatus.audio_queue_size} -
-
-
-
Transcription Queue
-
- {processorStatus.transcription_queue_size} -
-
-
-
Memory Queue
-
- {processorStatus.memory_queue_size} -
-
-
-
Active Tasks
-
- {processorStatus.active_tasks} -
-
-
-
- )} {/* Diarization Settings */}
@@ -538,39 +479,6 @@ export default function System() { {/* Speaker Configuration */} - {/* Active Clients */} -
-

- - Active Clients ({activeClients.length}) -

- {activeClients.length > 0 ? ( -
- {activeClients.map((client) => ( -
-
-
{client.id}
-
- User: {client.user_id} -
-
-
-
- Connected: {formatDate(client.connected_at)} -
-
- Last: {formatDate(client.last_activity)} -
-
-
- ))} -
- ) : ( -

- No active clients -

- )} -
{/* Debug Metrics */} {metricsData?.debug_tracker && ( diff --git a/backends/advanced/webui/src/services/api.ts b/backends/advanced/webui/src/services/api.ts index 5c9d82f0..9da281e6 100644 --- a/backends/advanced/webui/src/services/api.ts +++ b/backends/advanced/webui/src/services/api.ts @@ -131,6 +131,13 @@ export const systemApi = { headers: { 'Content-Type': 'text/plain' } }), reloadMemoryConfig: () => api.post('/api/admin/memory/config/reload'), + + // Processing overview and detailed monitoring + getProcessorOverview: () => api.get('/api/processor/overview'), + getProcessorHistory: (page = 1, perPage = 50) => + api.get('/api/processor/history', { params: { page, per_page: perPage } }), + getClientProcessingDetail: (clientId: string) => + api.get(`/api/processor/clients/${clientId}`), } export const uploadApi = { From 20cfe81bf473f72c0a89b725b02aa59c37c21470 Mon Sep 17 00:00:00 2001 From: Ankush Malaker <43288948+AnkushMalaker@users.noreply.github.com> Date: Fri, 19 Sep 2025 20:47:00 +0000 Subject: [PATCH 2/7] Refactor audio processing and enhance upload functionality - Updated the `CLAUDE.md` documentation to reflect changes in ASR service command. - Introduced a new method `load_audio_file_as_chunk` in `audio_utils.py` for loading audio files into the Wyoming AudioChunk format. - Enhanced `ProcessorManager` to include client type detection and improved cleanup of processing tasks. - Updated `conversation_controller.py` to queue transcription and memory processing jobs with better error handling. - Refactored the `Upload` component in the web UI to support a three-phase upload process with improved status management and polling for processing tasks. - Added new API methods for asynchronous file uploads and job status retrieval. 
--- CLAUDE.md | 15 +- .../src/advanced_omi_backend/audio_utils.py | 67 +++ .../controllers/conversation_controller.py | 72 +++- .../controllers/system_controller.py | 44 +- .../advanced/src/advanced_omi_backend/main.py | 10 + .../memory/memory_service.py | 8 + .../src/advanced_omi_backend/processors.py | 182 +++++++- .../routers/modules/system_routes.py | 2 + backends/advanced/webui/src/pages/Upload.tsx | 400 ++++++++++++++++-- backends/advanced/webui/src/services/api.ts | 27 +- extras/speaker-recognition/sortformer.py | 350 +++++++++++++++ 11 files changed, 1104 insertions(+), 73 deletions(-) create mode 100644 extras/speaker-recognition/sortformer.py diff --git a/CLAUDE.md b/CLAUDE.md index 8ee8193c..aadafd4c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -122,7 +122,7 @@ npm run web ```bash # ASR Services cd extras/asr-services -docker compose up parakeet # Offline ASR with Parakeet +docker compose up parakeet-asr # Offline ASR with Parakeet # Speaker Recognition (with tests) cd extras/speaker-recognition @@ -136,13 +136,6 @@ docker compose up --build ## Architecture Overview -### Core Structure -- **backends/advanced-backend/**: Primary FastAPI backend with real-time audio processing - - `src/main.py`: Central FastAPI application with WebSocket audio streaming - - `src/auth.py`: Email-based authentication with JWT tokens - - `src/memory/`: LLM-powered conversation memory system using mem0 - - `webui/`: React-based web dashboard for conversation and user management - ### Key Components - **Audio Pipeline**: Real-time Opus/PCM → Application-level processing → Deepgram/Mistral transcription → memory extraction - **Wyoming Protocol**: WebSocket communication uses Wyoming protocol (JSONL + binary) for structured audio sessions @@ -1214,12 +1207,6 @@ curl http://[gpu-machine-ip]:8085/health # Speaker recognition ### Troubleshooting Distributed Setup -**Common Issues:** -- **CORS errors**: Tailscale IPs are automatically supported, but verify CORS_ORIGINS if using custom 
IPs -- **Service discovery**: Use `tailscale ip` to find machine IPs -- **Port conflicts**: Ensure services use different ports on shared machines -- **Authentication**: Services must be accessible without authentication for inter-service communication - **Debugging Commands:** ```bash # Check Tailscale connectivity diff --git a/backends/advanced/src/advanced_omi_backend/audio_utils.py b/backends/advanced/src/advanced_omi_backend/audio_utils.py index 2821d126..1a3937c7 100644 --- a/backends/advanced/src/advanced_omi_backend/audio_utils.py +++ b/backends/advanced/src/advanced_omi_backend/audio_utils.py @@ -6,6 +6,10 @@ import logging import os import time +import wave +import io +import numpy as np +from pathlib import Path # Type import to avoid circular imports from typing import TYPE_CHECKING, Optional @@ -88,6 +92,69 @@ async def process_audio_chunk( client_state.update_audio_received(chunk) +async def load_audio_file_as_chunk(audio_path: Path) -> AudioChunk: + """Load existing audio file into Wyoming AudioChunk format for reprocessing. 
+
+    Args:
+        audio_path: Path to the audio file on disk
+
+    Returns:
+        AudioChunk object ready for processing
+
+    Raises:
+        FileNotFoundError: If audio file doesn't exist
+        ValueError: If the file cannot be loaded as 16kHz, 16-bit WAV (mono, or stereo that is downmixed)
+    """
+    try:
+        # Read the audio file
+        with open(audio_path, 'rb') as f:
+            file_content = f.read()
+
+        # Process WAV file using existing pattern from system_controller.py
+        with wave.open(io.BytesIO(file_content), "rb") as wav_file:
+            sample_rate = wav_file.getframerate()
+            sample_width = wav_file.getsampwidth()
+            channels = wav_file.getnchannels()
+            audio_data = wav_file.readframes(wav_file.getnframes())
+
+        # Downmix stereo to mono; average in float64 (np.mean default) so L+R cannot wrap around int16
+        if channels == 2:
+            if sample_width == 2:
+                audio_array = np.frombuffer(audio_data, dtype=np.int16)
+                audio_array = audio_array.reshape(-1, 2)
+                audio_data = np.mean(audio_array, axis=1).astype(np.int16).tobytes()
+                channels = 1
+            else:
+                raise ValueError(f"Unsupported sample width for stereo: {sample_width}")
+
+        # Validate format matches expected (16kHz, mono, 16-bit)
+        if sample_rate != 16000:
+            raise ValueError(f"Audio file has sample rate {sample_rate}Hz, expected 16kHz")
+        if channels != 1:
+            raise ValueError(f"Audio file has {channels} channels, expected mono")
+        if sample_width != 2:
+            raise ValueError(f"Audio file has {sample_width}-byte samples, expected 2 bytes")
+
+        # Create AudioChunk with current timestamp
+        chunk = AudioChunk(
+            audio=audio_data,
+            rate=sample_rate,
+            width=sample_width,
+            channels=channels,
+            timestamp=int(time.time() * 1000)
+        )
+
+        logger.info(f"Loaded audio file {audio_path} as AudioChunk ({len(audio_data)} bytes)")
+        return chunk
+
+    except FileNotFoundError:
+        logger.error(f"Audio file not found: {audio_path}")
+        raise
+    except Exception as e:
+        logger.error(f"Error loading audio file {audio_path}: {e}")
+        raise ValueError(f"Invalid audio file format: {e}") from e  # keep the original cause in the traceback
+
+
 async def _process_audio_cropping_with_relative_timestamps(
     original_path: str,
speech_segments: list[tuple[float, float]], diff --git a/backends/advanced/src/advanced_omi_backend/controllers/conversation_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/conversation_controller.py index e53eef88..3df2a281 100644 --- a/backends/advanced/src/advanced_omi_backend/controllers/conversation_controller.py +++ b/backends/advanced/src/advanced_omi_backend/controllers/conversation_controller.py @@ -5,12 +5,14 @@ import asyncio import hashlib import logging +import os import time from pathlib import Path from typing import Optional from advanced_omi_backend.audio_utils import ( _process_audio_cropping_with_relative_timestamps, + load_audio_file_as_chunk, ) from advanced_omi_backend.client_manager import ( ClientManager, @@ -18,7 +20,8 @@ get_user_clients_all, ) from advanced_omi_backend.database import AudioChunksRepository, ProcessingRunsRepository, chunks_col, processing_runs_col, conversations_col, ConversationsRepository -from advanced_omi_backend.users import User +from advanced_omi_backend.processors import get_processor_manager, TranscriptionItem, MemoryProcessingItem +from advanced_omi_backend.users import User, get_user_by_id from fastapi.responses import JSONResponse logger = logging.getLogger(__name__) @@ -585,9 +588,10 @@ async def reprocess_transcript(conversation_id: str, user: User): ) # Generate configuration hash for duplicate detection + transcription_provider = os.getenv("TRANSCRIPTION_PROVIDER", "deepgram") config_data = { "audio_path": str(full_audio_path), - "transcription_provider": "deepgram", # This would come from settings + "transcription_provider": transcription_provider, "trigger": "manual_reprocess" } config_hash = hashlib.sha256(str(config_data).encode()).hexdigest()[:16] @@ -613,18 +617,37 @@ async def reprocess_transcript(conversation_id: str, user: User): status_code=500, content={"error": "Failed to create transcript version"} ) - # TODO: Queue audio for reprocessing with ProcessorManager - # This 
is where we would integrate with the existing processor - # For now, we'll return the version ID for the caller to handle + # NEW: Load audio file and queue for transcription processing + try: + # Load audio file as AudioChunk + audio_chunk = await load_audio_file_as_chunk(full_audio_path) + + # Create TranscriptionItem for reprocessing + transcription_item = TranscriptionItem( + client_id=f"reprocess-{conversation_id}", + user_id=str(user.user_id), + audio_uuid=audio_uuid, + audio_chunk=audio_chunk + ) + + # Queue for transcription processing + processor_manager = get_processor_manager() + await processor_manager.queue_transcription(transcription_item) + + logger.info(f"Queued transcript reprocessing job {run_id} (version {version_id}) for conversation {conversation_id}") - logger.info(f"Created transcript reprocessing job {run_id} (version {version_id}) for conversation {conversation_id}") + except Exception as e: + logger.error(f"Error queuing transcript reprocessing: {e}") + return JSONResponse( + status_code=500, content={"error": f"Failed to queue reprocessing: {str(e)}"} + ) return JSONResponse(content={ "message": f"Transcript reprocessing started for conversation {conversation_id}", "run_id": run_id, "version_id": version_id, "config_hash": config_hash, - "status": "PENDING" + "status": "QUEUED" }) except Exception as e: @@ -673,9 +696,10 @@ async def reprocess_memory(conversation_id: str, transcript_version_id: str, use ) # Generate configuration hash for duplicate detection + memory_provider = os.getenv("MEMORY_PROVIDER", "friend_lite") config_data = { "transcript_version_id": transcript_version_id, - "memory_provider": "friend_lite", # This would come from settings + "memory_provider": memory_provider, "trigger": "manual_reprocess" } config_hash = hashlib.sha256(str(config_data).encode()).hexdigest()[:16] @@ -702,10 +726,34 @@ async def reprocess_memory(conversation_id: str, transcript_version_id: str, use status_code=500, content={"error": "Failed to 
create memory version"} ) - # TODO: Queue memory extraction for processing - # This is where we would integrate with the existing memory processor + # NEW: Queue memory processing + try: + # Get user email for memory processing + user_obj = await get_user_by_id(str(user.user_id)) + if not user_obj: + return JSONResponse( + status_code=500, content={"error": "User not found for memory processing"} + ) + + # Create MemoryProcessingItem for reprocessing + memory_item = MemoryProcessingItem( + client_id=f"reprocess-{conversation_id}", + user_id=str(user.user_id), + user_email=user_obj.email, + conversation_id=conversation_id + ) + + # Queue for memory processing + processor_manager = get_processor_manager() + await processor_manager.queue_memory(memory_item) - logger.info(f"Created memory reprocessing job {run_id} (version {version_id}) for conversation {conversation_id}") + logger.info(f"Queued memory reprocessing job {run_id} (version {version_id}) for conversation {conversation_id}") + + except Exception as e: + logger.error(f"Error queuing memory reprocessing: {e}") + return JSONResponse( + status_code=500, content={"error": f"Failed to queue memory reprocessing: {str(e)}"} + ) return JSONResponse(content={ "message": f"Memory reprocessing started for conversation {conversation_id}", @@ -713,7 +761,7 @@ async def reprocess_memory(conversation_id: str, transcript_version_id: str, use "version_id": version_id, "transcript_version_id": transcript_version_id, "config_hash": config_hash, - "status": "PENDING" + "status": "QUEUED" }) except Exception as e: diff --git a/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py b/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py index d863985f..095c6801 100644 --- a/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py +++ b/backends/advanced/src/advanced_omi_backend/controllers/system_controller.py @@ -523,9 +523,14 @@ async def list_processing_jobs(): 
async def process_files_with_content( job_id: str, file_data: list[tuple[str, bytes]], user: User, device_name: str ): - """Background task to process uploaded files using pre-read content.""" + """Background task to process uploaded files using pre-read content. + + Creates persistent clients that remain active in an upload session, + following the same code path as WebSocket clients. + """ # Import here to avoid circular imports - from advanced_omi_backend.main import cleanup_client_state, create_client_state + from advanced_omi_backend.main import create_client_state, cleanup_client_state + import uuid audio_logger.info( f"🚀 process_files_with_content called for job {job_id} with {len(file_data)} files" @@ -536,8 +541,13 @@ async def process_files_with_content( # Update job status to processing await job_tracker.update_job_status(job_id, JobStatus.PROCESSING) + # Process files one by one + processed_files = [] + for file_index, (filename, content) in enumerate(file_data): - client_id = None + # Generate client ID for this file + file_device_name = f"{device_name}-{file_index + 1:03d}" + client_id = generate_client_id(user, file_device_name) client_state = None try: @@ -577,18 +587,22 @@ async def process_files_with_content( ) continue - # Generate unique client ID for each file + # Use pre-generated client ID from upload session file_device_name = f"{device_name}-{file_index + 1:03d}" - client_id = generate_client_id(user, file_device_name) # Update job tracker with client ID await job_tracker.update_file_status( job_id, filename, FileStatus.PROCESSING, client_id=client_id ) - # Create client state + # Create persistent client state (will be tracked by ProcessorManager) client_state = await create_client_state(client_id, user, file_device_name) + + audio_logger.info( + f"👤 [Job {job_id}] Created persistent client {client_id} for file {filename}" + ) + # Process WAV file with wave.open(io.BytesIO(content), "rb") as wav_file: sample_rate = wav_file.getframerate() 
@@ -732,21 +746,23 @@ async def process_files_with_content( job_id, filename, FileStatus.FAILED, error_message=error_msg ) finally: - # Always clean up client state to prevent accumulation + # Clean up client state immediately after upload completes (like WebSocket disconnect) + # ProcessorManager will continue tracking processing independently if client_id and client_state: try: await cleanup_client_state(client_id) - audio_logger.info( - f"🧹 [Job {job_id}] Cleaned up client state for {client_id}" - ) + audio_logger.info(f"🧹 Cleaned up client state for {client_id}") except Exception as cleanup_error: audio_logger.error( - f"❌ [Job {job_id}] Error cleaning up client state for {client_id}: {cleanup_error}" + f"❌ Error cleaning up client state for {client_id}: {cleanup_error}" ) # Mark job as completed await job_tracker.update_job_status(job_id, JobStatus.COMPLETED) - audio_logger.info(f"🎉 [Job {job_id}] All files processed") + + audio_logger.info( + f"🎉 [Job {job_id}] All files processed successfully." 
+ ) except Exception as e: error_msg = f"Job processing failed: {str(e)}" @@ -754,6 +770,7 @@ async def process_files_with_content( await job_tracker.update_job_status(job_id, JobStatus.FAILED, error_msg) + # Configuration functions moved to config.py to avoid circular imports @@ -1282,3 +1299,6 @@ async def get_client_processing_detail(client_id: str): return JSONResponse( status_code=500, content={"error": f"Failed to get client detail: {str(e)}"} ) + + + diff --git a/backends/advanced/src/advanced_omi_backend/main.py b/backends/advanced/src/advanced_omi_backend/main.py index 1eaafabe..f463f29d 100644 --- a/backends/advanced/src/advanced_omi_backend/main.py +++ b/backends/advanced/src/advanced_omi_backend/main.py @@ -273,6 +273,14 @@ async def cleanup_client_state(client_id: str): removed = await client_manager.remove_client_with_cleanup(client_id) if removed: + # Clean up processor manager task tracking + try: + processor_manager = get_processor_manager() + processor_manager.cleanup_processing_tasks(client_id) + logger.debug(f"Cleaned up processor tasks for client {client_id}") + except Exception as processor_cleanup_error: + logger.error(f"Error cleaning up processor tasks for {client_id}: {processor_cleanup_error}") + # Clean up any orphaned transcript events for this client coordinator = get_transcript_coordinator() coordinator.cleanup_transcript_events_for_client(client_id) @@ -320,6 +328,7 @@ async def lifespan(app: FastAPI): processor_manager = init_processor_manager(CHUNK_DIR, ac_repository) await processor_manager.start() + logger.info("App ready") try: yield @@ -331,6 +340,7 @@ async def lifespan(app: FastAPI): for client_id in client_manager.get_all_client_ids(): await cleanup_client_state(client_id) + # Shutdown processor manager processor_manager = get_processor_manager() await processor_manager.shutdown() diff --git a/backends/advanced/src/advanced_omi_backend/memory/memory_service.py 
b/backends/advanced/src/advanced_omi_backend/memory/memory_service.py index dc5bc21e..9518d6e1 100644 --- a/backends/advanced/src/advanced_omi_backend/memory/memory_service.py +++ b/backends/advanced/src/advanced_omi_backend/memory/memory_service.py @@ -176,11 +176,13 @@ async def add_memory( created_ids: List[str] = [] # If allow_update, try LLM-driven action proposal + update_processing_successful = False if allow_update and fact_memories_text: memory_logger.info(f"🔍 Allowing update for {source_id}") created_ids = await self._process_memory_updates( fact_memories_text, embeddings, user_id, client_id, source_id, user_email ) + update_processing_successful = True else: memory_logger.info(f"🔍 Not allowing update for {source_id}") # Add all extracted memories normally @@ -197,9 +199,15 @@ async def add_memory( if created_ids and db_helper: await self._update_database_relationships(db_helper, source_id, created_ids) + # Success conditions: + # 1. Normal path: created_ids > 0 (memories were added/updated) + # 2. 
Update path: LLM successfully processed actions (even if all NONE) if created_ids: memory_logger.info(f"✅ Upserted {len(created_ids)} memories for {source_id}") return True, created_ids + elif update_processing_successful: + memory_logger.info(f"✅ Memory update processing completed for {source_id} - LLM decided no changes needed") + return True, [] error_msg = f"❌ No memories created for {source_id}: memory_entries={len(memory_entries) if memory_entries else 0}, allow_update={allow_update}" memory_logger.error(error_msg) diff --git a/backends/advanced/src/advanced_omi_backend/processors.py b/backends/advanced/src/advanced_omi_backend/processors.py index 4a7343d3..67ea82a9 100644 --- a/backends/advanced/src/advanced_omi_backend/processors.py +++ b/backends/advanced/src/advanced_omi_backend/processors.py @@ -429,10 +429,23 @@ def get_processing_status(self, client_id: str) -> dict[str, Any]: # Check if all stages are complete all_complete = all(stage_info["completed"] for stage_info in stages.values()) + # Get user_id for the client from ClientManager + from advanced_omi_backend.client_manager import get_client_owner + user_id = get_client_owner(client_id) or "Unknown" + + # Determine client type (simple heuristic based on client_id pattern) + # Upload clients have pattern like: "abc123-upload-001", "abc123-upload-001-2", etc. 
+ # They contain "-upload-" in their client_id + # Reprocessing clients have pattern like: "reprocess-{conversation_id}" and should be treated like upload clients + import re + client_type = "upload" if ("-upload-" in client_id or client_id.startswith("reprocess-")) else "websocket" + return { "status": "complete" if all_complete else "processing", "stages": stages, "client_id": client_id, + "user_id": user_id, + "client_type": client_type, } def cleanup_processing_tasks(self, client_id: str): @@ -445,6 +458,167 @@ def cleanup_processing_tasks(self, client_id: str): del self.processing_state[client_id] logger.debug(f"Cleaned up processing state for client {client_id}") + def _is_stale(self, client_id: str, max_idle_minutes: int = 30) -> bool: + """Check if a processing entry is stale (no activity for specified time). + + Args: + client_id: Client ID to check + max_idle_minutes: Maximum idle time in minutes before considering stale + + Returns: + True if the entry is stale and should be cleaned up + """ + import time + + max_idle_seconds = max_idle_minutes * 60 + current_time = time.time() + + # Check processing_state timestamps + if client_id in self.processing_state: + client_state = self.processing_state[client_id] + # Find the most recent timestamp across all stages + latest_timestamp = 0 + for stage_info in client_state.values(): + if isinstance(stage_info, dict) and "timestamp" in stage_info: + latest_timestamp = max(latest_timestamp, stage_info["timestamp"]) + + if latest_timestamp > 0: + idle_time = current_time - latest_timestamp + return idle_time > max_idle_seconds + + # If no processing_state or no valid timestamps, consider it stale + return True + + def _cleanup_completed_entries(self): + """Clean up completed and stale processing entries independently of client lifecycle. + + This method is called from existing processor timeout handlers to maintain + clean processing state without affecting active client sessions. 
+ """ + import time + + clients_to_remove = [] + current_time = time.time() + + for client_id in list(self.processing_state.keys()): + try: + status = self.get_processing_status(client_id) + + # Clean up if processing is complete OR if upload client is done (even with failed stages) + client_type = status.get("client_type", "websocket") + + if status.get("status") == "complete": + if client_type == "upload": + # Upload clients: Clean up immediately when processing completes + clients_to_remove.append((client_id, "completed_upload")) + logger.info(f"Marking completed upload client for immediate cleanup: {client_id}") + + # Also trigger client state cleanup for upload clients + try: + from advanced_omi_backend.main import cleanup_client_state + import asyncio + + # Schedule client cleanup + asyncio.create_task(self._cleanup_upload_client_state(client_id)) + except Exception as cleanup_error: + logger.error(f"Error scheduling upload client cleanup for {client_id}: {cleanup_error}") + else: + # WebSocket clients: Wait for grace period before cleanup + completion_grace_period = 300 # 5 minutes + + # Check if all stages have been complete for grace period + all_stages_old_enough = True + for stage_info in status.get("stages", {}).values(): + if "timestamp" in stage_info: + stage_age = current_time - stage_info["timestamp"] + if stage_age < completion_grace_period: + all_stages_old_enough = False + break + + if all_stages_old_enough: + clients_to_remove.append((client_id, "completed_websocket")) + logger.info(f"Marking completed WebSocket client for cleanup: {client_id}") + + elif client_type == "upload" and status.get("status") == "processing": + # Upload clients: Also clean up if they're done processing (even with failed stages) + # Check if all stages are either completed or have failed (i.e., no longer actively processing) + stages = status.get("stages", {}) + all_stages_done = True + + for stage_name, stage_info in stages.items(): + if not stage_info.get("completed", 
False) and stage_info.get("status") not in ["failed", "completed"]: + all_stages_done = False + break + + if all_stages_done: + clients_to_remove.append((client_id, "finished_upload")) + logger.info(f"Marking finished upload client for cleanup: {client_id} (some stages may have failed)") + + # Also trigger client state cleanup for upload clients + try: + from advanced_omi_backend.main import cleanup_client_state + import asyncio + + # Schedule client cleanup + asyncio.create_task(self._cleanup_upload_client_state(client_id)) + except Exception as cleanup_error: + logger.error(f"Error scheduling upload client cleanup for {client_id}: {cleanup_error}") + + # Clean up if stale (no activity for 30+ minutes) + elif self._is_stale(client_id, max_idle_minutes=30): + clients_to_remove.append((client_id, "stale")) + logger.info(f"Marking stale processing entry for cleanup: {client_id}") + + except Exception as e: + logger.error(f"Error checking processing status for {client_id}: {e}") + # If we can't check status, consider it for cleanup + clients_to_remove.append((client_id, "error")) + + # Remove the identified entries + for client_id, reason in clients_to_remove: + try: + self._remove_processing_entry(client_id, reason) + except Exception as e: + logger.error(f"Error removing processing entry for {client_id}: {e}") + + async def _cleanup_upload_client_state(self, client_id: str): + """Clean up client state for completed upload clients. + + This method handles the client state cleanup that was previously done + in the background task's finally block, but now happens when processing completes. 
+ """ + try: + from advanced_omi_backend.main import cleanup_client_state + + logger.info(f"🧹 Starting upload client state cleanup for {client_id}") + await cleanup_client_state(client_id) + logger.info(f"✅ Successfully cleaned up upload client state for {client_id}") + + except Exception as e: + logger.error(f"❌ Error cleaning up upload client state for {client_id}: {e}", exc_info=True) + + def _remove_processing_entry(self, client_id: str, reason: str = "cleanup"): + """Remove processing state and task tracking for a client. + + Args: + client_id: Client ID to remove + reason: Reason for removal (for logging) + """ + removed_items = [] + + if client_id in self.processing_state: + del self.processing_state[client_id] + removed_items.append("processing_state") + + if client_id in self.processing_tasks: + del self.processing_tasks[client_id] + removed_items.append("processing_tasks") + + if removed_items: + logger.info(f"🧹 Cleaned up processing entry for {client_id} ({reason}): {', '.join(removed_items)}") + else: + logger.debug(f"No processing entry found to clean up for {client_id} ({reason})") + def get_all_processing_status(self) -> dict[str, Any]: """Get processing status for all clients.""" # Get all client IDs from both tracking types @@ -815,7 +989,7 @@ async def _audio_processor(self): ) except asyncio.TimeoutError: - # Periodic health check + # Periodic health check and cleanup active_clients = len(self.active_file_sinks) queue_size = self.audio_queue.qsize() if queue_size > 0 or active_clients > 0: @@ -824,6 +998,12 @@ async def _audio_processor(self): f"{queue_size} items in queue" ) + # Perform cleanup of completed/stale processing entries + try: + self._cleanup_completed_entries() + except Exception as cleanup_error: + audio_logger.error(f"Error during processing entry cleanup: {cleanup_error}") + except Exception as e: audio_logger.error(f"Fatal error in audio processor: {e}", exc_info=True) finally: diff --git 
a/backends/advanced/src/advanced_omi_backend/routers/modules/system_routes.py b/backends/advanced/src/advanced_omi_backend/routers/modules/system_routes.py index 494db6ce..21534a6f 100644 --- a/backends/advanced/src/advanced_omi_backend/routers/modules/system_routes.py +++ b/backends/advanced/src/advanced_omi_backend/routers/modules/system_routes.py @@ -189,3 +189,5 @@ async def get_client_processing_detail_route( ): """Get detailed processing information for specific client. Admin only.""" return await system_controller.get_client_processing_detail(client_id) + + diff --git a/backends/advanced/webui/src/pages/Upload.tsx b/backends/advanced/webui/src/pages/Upload.tsx index 04e7d24c..b77005b4 100644 --- a/backends/advanced/webui/src/pages/Upload.tsx +++ b/backends/advanced/webui/src/pages/Upload.tsx @@ -1,6 +1,6 @@ -import React, { useState, useCallback } from 'react' +import React, { useState, useCallback, useEffect } from 'react' import { Upload as UploadIcon, File, X, CheckCircle, AlertCircle, RefreshCw } from 'lucide-react' -import { uploadApi } from '../services/api' +import { uploadApi, systemApi } from '../services/api' import { useAuth } from '../contexts/AuthContext' interface UploadFile { @@ -10,11 +10,63 @@ interface UploadFile { error?: string } +// Legacy JobStatus interface - kept for backward compatibility +interface JobStatus { + job_id: string + status: 'pending' | 'processing' | 'completed' | 'failed' + total_files: number + processed_files: number + current_file?: string + progress_percent: number + files?: Array<{ + filename: string + client_id: string + status: 'pending' | 'processing' | 'completed' | 'failed' + transcription_status?: string + memory_status?: string + error_message?: string + }> +} + +// New unified processing interfaces +interface ProcessingTask { + client_id: string + user_id: string + status: 'processing' | 'complete' + stages: Record +} + +// UploadSessionData interface removed - replaced by unified processor tasks polling + 
+interface UploadSession { + job_id: string + file_names: string[] + started_at: number + upload_completed: boolean + total_files: number +} + export default function Upload() { const [files, setFiles] = useState([]) - const [isUploading, setIsUploading] = useState(false) const [dragActive, setDragActive] = useState(false) + + // Three-phase state management + const [uploadPhase, setUploadPhase] = useState<'idle' | 'uploading' | 'completed'>('idle') const [uploadProgress, setUploadProgress] = useState(0) + const [processingPhase, setProcessingPhase] = useState<'idle' | 'starting' | 'active' | 'completed'>('idle') + const [jobStatus, setJobStatus] = useState(null) + const [processingTasks, setProcessingTasks] = useState([]) + + // Polling configuration + const [autoRefresh, setAutoRefresh] = useState(true) + const [refreshInterval, setRefreshInterval] = useState(2000) // 2s default for upload page + const [isPolling, setIsPolling] = useState(false) const { isAdmin } = useAuth() @@ -61,10 +113,146 @@ export default function Upload() { handleFileSelect(e.dataTransfer.files) }, []) + // localStorage persistence + const saveSession = (session: UploadSession) => { + localStorage.setItem('upload_session', JSON.stringify(session)) + } + + const getStoredSession = (): UploadSession | null => { + const saved = localStorage.getItem('upload_session') + return saved ? 
JSON.parse(saved) : null + } + + const clearStoredSession = () => { + localStorage.removeItem('upload_session') + } + + // Resume session on page load + useEffect(() => { + const session = getStoredSession() + if (session) { + setProcessingPhase('active') + setIsPolling(true) + // Use unified polling without session dependency + pollProcessingStatus() + } + }, []) + + // Polling effect + useEffect(() => { + if (!autoRefresh || !isPolling) return + + const interval = setInterval(() => { + pollProcessingStatus() + }, refreshInterval) + + return () => clearInterval(interval) + }, [autoRefresh, refreshInterval, isPolling]) + + // New unified polling approach - polls processor tasks directly without session dependency + const pollProcessingStatus = async () => { + try { + // Get all processor tasks + const tasksResponse = await systemApi.getProcessorTasks() + const allTasks = tasksResponse.data + + // Filter for upload clients (identified by client_id pattern ending with 3-digit numbers like "-001", "-002") + const uploadTasks: ProcessingTask[] = Object.entries(allTasks) + .filter(([clientId, taskData]) => { + // Upload clients have pattern like: "abc123-upload-001", "abc123-upload-002" + return /.*-upload-\d{3}$/.test(clientId) + }) + .map(([clientId, taskData]: [string, any]) => ({ + client_id: clientId, + user_id: taskData?.user_id || 'Unknown', + status: taskData?.status || 'processing', + stages: taskData?.stages || {} + })) + .filter(task => Object.keys(task.stages).length > 0) // Only show clients with active processing + + setProcessingTasks(uploadTasks) + + // Check if all clients are complete OR no upload tasks exist (meaning processing finished) + const allComplete = uploadTasks.length > 0 && uploadTasks.every(task => task.status === 'complete') + const noActiveTasks = uploadTasks.length === 0 && processingPhase === 'active' + + if (allComplete || noActiveTasks) { + setIsPolling(false) + setProcessingPhase('completed') + clearStoredSession() + + 
setFiles(prevFiles => + prevFiles.map(f => ({ + ...f, + status: 'success' + })) + ) + } else if (uploadTasks.some(task => Object.values(task.stages).some(stage => stage.error))) { + // Check for any errors in processing stages + const hasErrors = uploadTasks.some(task => + Object.values(task.stages).some(stage => stage.error) + ) + + if (hasErrors) { + setFiles(prevFiles => + prevFiles.map(f => ({ + ...f, + status: 'error', + error: 'Processing failed' + })) + ) + } + } + } catch (error) { + console.error('Failed to poll processing status:', error) + } + } + + // Legacy job polling for backward compatibility + const pollJobStatus = async (jobId: string) => { + try { + // Use new unified polling (no session dependency) + await pollProcessingStatus() + + // Also get legacy job status for progress display (if available) + try { + const response = await uploadApi.getJobStatus(jobId) + const status: JobStatus = response.data + setJobStatus(status) + } catch (jobError) { + console.log('Legacy job status not available, using unified polling only') + } + } catch (error) { + console.error('Failed to poll unified processing status:', error) + // Fallback to legacy job polling + try { + const response = await uploadApi.getJobStatus(jobId) + const status: JobStatus = response.data + setJobStatus(status) + + if (status.status === 'completed' || status.status === 'failed') { + setIsPolling(false) + setProcessingPhase('completed') + clearStoredSession() + + setFiles(prevFiles => + prevFiles.map(f => ({ + ...f, + status: status.status === 'completed' ? 
'success' : 'error' + })) + ) + } + } catch (fallbackError) { + console.error('All polling methods failed:', fallbackError) + } + } + } + const uploadFiles = async () => { if (files.length === 0) return - setIsUploading(true) + // Phase 1: File Upload + setUploadPhase('uploading') setUploadProgress(0) try { @@ -74,38 +262,66 @@ export default function Upload() { }) // Update all files to uploading status - setFiles(prevFiles => + setFiles(prevFiles => prevFiles.map(f => ({ ...f, status: 'uploading' as const })) ) - await uploadApi.uploadAudioFiles(formData, (progress) => { + // Phase 1: Upload files and get job ID + const response = await uploadApi.uploadAudioFilesAsync(formData, (progress) => { setUploadProgress(progress) }) - - // Mark all files as successful - setFiles(prevFiles => - prevFiles.map(f => ({ ...f, status: 'success' as const })) - ) + + // Phase 2: Job Creation + setUploadPhase('completed') + setProcessingPhase('starting') + + const jobData = response.data + const jobId = jobData.job_id || jobData.jobs?.[0]?.job_id + + if (!jobId) { + throw new Error('No job ID received from server') + } + + // Save session for disconnection handling + const session: UploadSession = { + job_id: jobId, + file_names: files.map(f => f.file.name), + started_at: Date.now(), + upload_completed: true, + total_files: files.length + } + saveSession(session) + + // Phase 3: Start polling for processing status + setProcessingPhase('active') + setIsPolling(true) + pollJobStatus(jobId) } catch (error: any) { console.error('Upload failed:', error) - + + setUploadPhase('idle') + setProcessingPhase('idle') + // Mark all files as failed - setFiles(prevFiles => - prevFiles.map(f => ({ - ...f, - status: 'error' as const, - error: error.message || 'Upload failed' + setFiles(prevFiles => + prevFiles.map(f => ({ + ...f, + status: 'error' as const, + error: error.message || 'Upload failed' })) ) - } finally { - setIsUploading(false) - setUploadProgress(100) } } const clearCompleted = () 
=> { setFiles(files.filter(f => f.status === 'pending' || f.status === 'uploading')) + if (processingPhase === 'completed') { + setProcessingPhase('idle') + setUploadPhase('idle') + setJobStatus(null) + clearStoredSession() + } } const formatFileSize = (bytes: number) => { @@ -205,10 +421,13 @@ export default function Upload() {
@@ -261,12 +480,12 @@ export default function Upload() { )} - {/* Upload Progress */} - {isUploading && ( + {/* Phase 1: Upload Progress */} + {uploadPhase === 'uploading' && (
- Processing audio files... + Uploading files... ({files.length} files) {uploadProgress}% @@ -278,9 +497,124 @@ export default function Upload() { style={{ width: `${uploadProgress}%` }} />
-

- Note: Processing may take up to 5 minutes depending on file size and quantity. -

+
+ )} + + {/* Phase 2: Job Creation */} + {processingPhase === 'starting' && ( +
+
+ + Files uploaded. Starting processing jobs... + + +
+
+ )} + + {/* Phase 3: Processing Status with Configurable Refresh */} + {processingPhase === 'active' && jobStatus && ( +
+ {/* Refresh Controls */} +
+
+ + + +
+ + +
+ + {/* Processing Status */} +
+
+ + Processing file {jobStatus.processed_files + 1}/{jobStatus.total_files} + {jobStatus.current_file && `: ${jobStatus.current_file}`} + + + {Math.round(jobStatus.progress_percent)}% + +
+ +
+
+
+ +

+ Processing may take up to 3x audio duration + 60s. Status updates every {refreshInterval/1000}s. +

+
+ + {/* Per-File Status */} + {jobStatus.files && jobStatus.files.length > 0 && ( +
+

File Processing Status

+
+ {jobStatus.files.map((file, index) => ( +
+ + {file.filename} + +
+ + {file.status.charAt(0).toUpperCase() + file.status.slice(1)} + + {file.status === 'processing' && ( + + )} +
+
+ ))} +
+
+ )} +
+ )} + + {/* Completion Status */} + {processingPhase === 'completed' && ( +
+
+ + + All files processed successfully! Check the Conversations tab to see results. + +
)} @@ -290,10 +624,12 @@ export default function Upload() { 📝 Upload Instructions
    -
  • • Audio files will be processed sequentially for transcription and memory extraction
  • -
  • • Processing time varies based on audio length (roughly 3x the audio duration + 60s)
  • -
  • • Large files or multiple files may cause timeout errors - this is normal
  • -
  • • Check the Conversations tab to see processed results
  • +
  • Phase 1: Files upload quickly to server (progress bar shows transfer)
  • +
  • Phase 2: Processing jobs created (immediate)
  • +
  • Phase 3: Audio processing (transcription + memory extraction, ~3x audio duration)
  • +
  • • You can safely navigate away - processing continues in background
  • +
  • • Refresh rate is configurable (0.5s to 10s) during processing
  • +
  • • Check Conversations tab for final results
  • • Supported formats: WAV, MP3, M4A, FLAC
diff --git a/backends/advanced/webui/src/services/api.ts b/backends/advanced/webui/src/services/api.ts index 9da281e6..32dec703 100644 --- a/backends/advanced/webui/src/services/api.ts +++ b/backends/advanced/webui/src/services/api.ts @@ -141,7 +141,7 @@ export const systemApi = { } export const uploadApi = { - uploadAudioFiles: (files: FormData, onProgress?: (progress: number) => void) => + uploadAudioFiles: (files: FormData, onProgress?: (progress: number) => void) => api.post('/api/process-audio-files', files, { headers: { 'Content-Type': 'multipart/form-data' }, timeout: 300000, // 5 minutes @@ -152,6 +152,27 @@ export const uploadApi = { } } }), + + // Async upload using existing infrastructure - returns job IDs for monitoring + uploadAudioFilesAsync: (files: FormData, onUploadProgress?: (progress: number) => void) => + api.post('/api/process-audio-files-async', files, { + headers: { 'Content-Type': 'multipart/form-data' }, + timeout: 300000, // 5 minutes for upload phase + onUploadProgress: (progressEvent) => { + if (onUploadProgress && progressEvent.total) { + const progress = Math.round((progressEvent.loaded * 100) / progressEvent.total) + onUploadProgress(progress) + } + } + }), + + // Get job status for a specific job + getJobStatus: (jobId: string) => + api.get(`/api/process-audio-files/jobs/${jobId}`), + + // Get status for multiple jobs + getJobStatuses: (jobIds: string[]) => + Promise.all(jobIds.map(jobId => uploadApi.getJobStatus(jobId))) } export const chatApi = { @@ -205,4 +226,6 @@ export const speakerApi = { // Check speaker service status (admin only) getSpeakerServiceStatus: () => api.get('/api/speaker-service-status'), -} \ No newline at end of file +} + +// Upload session API removed - functionality replaced by unified processor tasks polling diff --git a/extras/speaker-recognition/sortformer.py b/extras/speaker-recognition/sortformer.py new file mode 100644 index 00000000..d1990fd1 --- /dev/null +++ b/extras/speaker-recognition/sortformer.py 
@@ -0,0 +1,391 @@
+#!/usr/bin/env python3
+"""
+Test script for NVIDIA SortFormer diarization model with speaker enrollment.
+Tests on conversation and enrollment audio files, then maps diarized tracks to enrolled speakers.
+"""
+import os
+import sys
+import tempfile
+import wave
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import nemo.collections.asr as nemo_asr
+import numpy as np
+import soundfile as sf
+import torch
+import torchaudio
+from nemo.collections.asr.models import SortformerEncLabelModel
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+TARGET_SR = 16000
+
+def get_audio_duration(file_path):
+    """Return the duration of a WAV file in seconds, or 0.0 if it cannot be read."""
+    try:
+        with wave.open(file_path, 'r') as wav_file:
+            frames = wav_file.getnframes()
+            sample_rate = wav_file.getframerate()
+            return frames / float(sample_rate)
+    except (wave.Error, OSError, EOFError, ZeroDivisionError):
+        # Unreadable, truncated, non-WAV, or zero-rate file: duration is only
+        # used for progress output, so report 0.0 instead of aborting.
+        return 0.0
+
+def load_audio_16k_mono(path: str) -> Tuple[torch.Tensor, int]:
+    """Load an audio file as a 1-D mono tensor resampled to TARGET_SR."""
+    wav, sr = torchaudio.load(path)
+    if wav.shape[0] > 1:
+        wav = torch.mean(wav, dim=0, keepdim=True)  # convert to mono
+    if sr != TARGET_SR:
+        wav = torchaudio.functional.resample(wav, sr, TARGET_SR)
+    return wav.squeeze(0), TARGET_SR
+
+def write_temp_wav(path: str, wav: torch.Tensor, sr: int = TARGET_SR) -> None:
+    """Write ``wav`` to ``path`` as a temporary wav file for embedding extraction."""
+    sf.write(path, wav.cpu().numpy(), sr)
+
+def get_embedding_from_file(speaker_model, file_path: str) -> Optional[torch.Tensor]:
+    """Extract an L2-normalized speaker embedding from an audio file.
+
+    Returns None (after printing the error) when extraction fails, so that a
+    single bad file does not abort the whole run.
+    """
+    try:
+        with torch.no_grad():
+            emb = speaker_model.get_embedding(file_path)
+
+        # Handle different return types from get_embedding
+        if isinstance(emb, (list, tuple)):
+            emb = emb[0]
+        if isinstance(emb, np.ndarray):
+            emb = torch.from_numpy(emb)
+
+        emb = emb.float().squeeze().cpu()
+        # Normalize embedding
+        return emb / (emb.norm(p=2) + 1e-9)
+    except Exception as e:
+        print(f" ERROR extracting embedding from {file_path}: {e}")
+        return None
+
+def create_speaker_enrollment(speaker_model, enrollment_files: Dict[str, List[str]]) -> Dict[str, torch.Tensor]:
+    """Create speaker enrollment centroids from multiple audio files per speaker.
+
+    Missing files are skipped with a warning; speakers for which no embedding
+    could be extracted are left out of the returned mapping.
+    """
+    enrollment = {}
+
+    print("\n" + "="*60)
+    print("SPEAKER ENROLLMENT")
+    print("="*60)
+
+    for speaker_name, file_list in enrollment_files.items():
+        print(f"\nEnrolling {speaker_name}...")
+        embeddings = []
+
+        for file_path in file_list:
+            if not os.path.exists(file_path):
+                print(f" WARNING: {file_path} not found")
+                continue
+
+            duration = get_audio_duration(file_path)
+            print(f" Processing {os.path.basename(file_path)} ({duration:.1f}s)...")
+
+            emb = get_embedding_from_file(speaker_model, file_path)
+            if emb is not None:
+                embeddings.append(emb)
+                print(f" ✓ Embedding extracted (shape: {emb.shape})")
+
+        if embeddings:
+            # Average embeddings to create centroid
+            centroid = torch.stack(embeddings, dim=0).mean(dim=0)
+            centroid = centroid / (centroid.norm(p=2) + 1e-9)  # normalize
+            enrollment[speaker_name] = centroid
+            print(f" ✓ {speaker_name} enrolled with {len(embeddings)} samples")
+            print(f" Centroid shape: {centroid.shape}")
+        else:
+            print(f" ✗ Failed to enroll {speaker_name} - no valid embeddings")
+
+    return enrollment
+
+def _parse_segment(seg) -> Tuple[float, float, int]:
+    """Return ``(start_sec, end_sec, speaker_index)`` for one diarization segment.
+
+    Accepts an indexable ``(start, end, speaker)`` record as well as a
+    whitespace-separated ``"start end speaker_N"`` string.
+    NOTE(review): the string form is what NeMo's Sortformer ``diarize()`` is
+    believed to emit -- confirm against the installed NeMo version.
+    """
+    if isinstance(seg, str):
+        seg = seg.split()
+    speaker = seg[2]
+    if isinstance(speaker, str):
+        speaker = speaker.rsplit("_", 1)[-1]  # "speaker_3" -> "3"
+    return float(seg[0]), float(seg[1]), int(speaker)
+
+def _flatten_diarization_output(result) -> List:
+    """Return the segment list for a single-file ``diarize()`` call.
+
+    NOTE(review): NeMo batch APIs appear to return one segment list per input
+    file; when ``result`` looks like that (exactly one inner list that is empty
+    or holds segment records), the inner list is returned. A flat list of
+    segments is returned as-is.
+    """
+    result = list(result)
+    if len(result) == 1 and isinstance(result[0], (list, tuple)):
+        inner = result[0]
+        if not inner or isinstance(inner[0], (str, list, tuple)):
+            return list(inner)
+    return result
+
+def extract_segments_embeddings(speaker_model, audio_file: str, segments: List) -> Dict[int, torch.Tensor]:
+    """Extract one averaged, L2-normalized embedding per diarized speaker track.
+
+    Segments shorter than 0.1 s are skipped; tracks for which no segment
+    yields an embedding are omitted from the returned mapping.
+    """
+    print("\n" + "="*60)
+    print("EXTRACTING TRACK EMBEDDINGS")
+    print("="*60)
+
+    # Load full audio
+    full_wav, _ = load_audio_16k_mono(audio_file)
+
+    # Group segments by speaker
+    speaker_segments = {}
+    for seg in segments:
+        start, end, spk_idx = _parse_segment(seg)
+        speaker_segments.setdefault(spk_idx, []).append((start, end))
+
+    track_embeddings = {}
+
+    # A private temporary directory cannot collide with an existing
+    # "tmp_segments" in the working directory, and it is removed (with all
+    # segment files) even if embedding extraction raises.
+    with tempfile.TemporaryDirectory(prefix="tmp_segments_") as temp_dir:
+        for spk_idx, seg_list in speaker_segments.items():
+            print(f"\nProcessing Speaker Track {spk_idx}...")
+            print(f" Found {len(seg_list)} segments")
+
+            seg_embeddings = []
+
+            for i, (start_sec, end_sec) in enumerate(seg_list):
+                # Extract audio segment
+                start_samp = int(start_sec * TARGET_SR)
+                end_samp = int(end_sec * TARGET_SR)
+                segment_wav = full_wav[start_samp:end_samp].clone()
+
+                # Skip very short segments
+                if segment_wav.numel() < TARGET_SR // 10:  # < 0.1 seconds
+                    print(f" Skipping segment {i+1} (too short: {len(segment_wav)/TARGET_SR:.2f}s)")
+                    continue
+
+                # Write temporary file
+                temp_path = os.path.join(temp_dir, f"spk{spk_idx}_{i:03d}.wav")
+                write_temp_wav(temp_path, segment_wav, TARGET_SR)
+
+                # Extract embedding
+                emb = get_embedding_from_file(speaker_model, temp_path)
+                if emb is not None:
+                    seg_embeddings.append(emb)
+                    print(f" ✓ Segment {i+1}: {start_sec:.2f}-{end_sec:.2f}s -> embedding extracted")
+
+            if seg_embeddings:
+                # Average embeddings for this speaker track
+                track_emb = torch.stack(seg_embeddings, dim=0).mean(dim=0)
+                track_emb = track_emb / (track_emb.norm(p=2) + 1e-9)  # normalize
+                track_embeddings[spk_idx] = track_emb
+                print(f" ✓ Track {spk_idx}: {len(seg_embeddings)} segments -> final embedding")
+            else:
+                print(f" ✗ Track {spk_idx}: No valid embeddings extracted")
+
+    return track_embeddings
+
+def _cosine_similarity(a: torch.Tensor, b: torch.Tensor) -> float:
+    """Calculate cosine similarity between two embeddings."""
+    return float(torch.dot(a, b) / ((a.norm(p=2) + 1e-9) * (b.norm(p=2) + 1e-9)))
+
+def map_speakers_to_enrollment(track_embeddings: Dict[int, torch.Tensor],
+                               enrollment: Dict[str, torch.Tensor],
+                               similarity_threshold: float = 0.0) -> Dict[int, str]:
+    """Map diarized speaker tracks to enrolled speaker identities.
+
+    Each track gets the enrolled name with the highest cosine similarity if
+    that similarity is at least ``similarity_threshold``; otherwise it is
+    labelled ``unknown_spk<track index>``.
+    """
+    print("\n" + "="*60)
+    print("SPEAKER IDENTITY MAPPING")
+    print("="*60)
+
+    speaker_mapping = {}
+
+    print(f"Similarity threshold: {similarity_threshold}")
+    print(f"Available enrolled speakers: {list(enrollment.keys())}")
+
+    for track_idx, track_emb in track_embeddings.items():
+        print(f"\nMapping Track {track_idx}:")
+
+        best_match = None
+        best_similarity = float("-inf")
+
+        # Compare with all enrolled speakers
+        for speaker_name, enrolled_emb in enrollment.items():
+            similarity = _cosine_similarity(track_emb, enrolled_emb)
+            print(f" vs {speaker_name}: {similarity:.4f}")
+
+            if similarity > best_similarity:
+                best_similarity = similarity
+                best_match = speaker_name
+
+        # Assign identity based on threshold; compare against None so that a
+        # falsy speaker name (e.g. "") can still be matched.
+        if best_match is not None and best_similarity >= similarity_threshold:
+            speaker_mapping[track_idx] = best_match
+            print(f" → Track {track_idx} mapped to: {best_match} (confidence: {best_similarity:.4f})")
+        else:
+            speaker_mapping[track_idx] = f"unknown_spk{track_idx}"
+            print(f" → Track {track_idx} mapped to: unknown_spk{track_idx} (low confidence: {best_similarity:.4f})")
+
+    return speaker_mapping
+
+def generate_labeled_segments(segments: List, speaker_mapping: Dict[int, str]) -> List[Dict]:
+    """Generate final segments with speaker labels.
+
+    Tracks missing from ``speaker_mapping`` are labelled ``spk<track index>``.
+    """
+    labeled_segments = []
+
+    for seg in segments:
+        start, end, spk_idx = _parse_segment(seg)
+        speaker_name = speaker_mapping.get(spk_idx, f"spk{spk_idx}")
+
+        labeled_segments.append({
+            "start": start,
+            "end": end,
+            "speaker": speaker_name,
+            "duration": end - start
+        })
+
+    return labeled_segments
+
+def test_sortformer_with_enrollment():
+    """Test SortFormer diarization with speaker enrollment and mapping."""
+    # Audio file paths
+    test_files = {
+        "conversation": "tests/assets/conversation_evan_katelyn_2min.wav",
+        "evan_enrollment": [
+            "tests/assets/evan/evan_001.wav",
+            "tests/assets/evan/evan_002.wav",
+            "tests/assets/evan/evan_003.wav",
+            "tests/assets/evan/evan_004.wav"
+        ],
+        "katelyn_enrollment": [
+            "tests/assets/katelyn/katelyn_001.wav",
+            "tests/assets/katelyn/katelyn_002.wav"
+        ]
+    }
+
+    # Check if files exist
+    print("Checking audio files...")
+    for files in test_files.values():
+        if isinstance(files, str):
+            files = [files]
+        for file_path in files:
+            if not os.path.exists(file_path):
+                print(f"WARNING: {file_path} not found")
+            else:
+                duration = get_audio_duration(file_path)
+                print(f"✓ {file_path} (duration: {duration:.1f}s)")
+
+    print(f"\nLoading models on {DEVICE}...")
+    try:
+        # Load diarization model
+        diar_model = SortformerEncLabelModel.from_pretrained("nvidia/diar_streaming_sortformer_4spk-v2").to(DEVICE)
+        diar_model.eval()
+        print("✓ SortFormer diarization model loaded")
+
+        # Load speaker verification model
+        speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large").to(DEVICE)
+        speaker_model.eval()
+        print("✓ TitaNet speaker embedding model loaded")
+
+    except Exception as e:
+        print(f"ERROR loading models: {e}")
+        return
+
+    # Test basic diarization first
+    conversation_file = test_files["conversation"]
+    if not os.path.exists(conversation_file):
+        print(f"ERROR: Conversation file not found: {conversation_file}")
+        return
+
+    print(f"\n{'='*60}")
+    print(f"BASIC DIARIZATION TEST: {conversation_file}")
+    print('='*60)
+
+    try:
+        segments = _flatten_diarization_output(
+            diar_model.diarize(audio=conversation_file, batch_size=1)
+        )
+        print(f"\nFound {len(segments)} diarized segments:")
+        for i, segment in enumerate(segments):
+            start, end, spk = _parse_segment(segment)
+            print(f" {i+1:2d}: {start:6.2f}-{end:6.2f}s | Speaker {spk} | Duration: {end-start:.2f}s")
+
+    except Exception as e:
+        print(f"ERROR during diarization: {e}")
+        return
+
+    # Create speaker enrollment
+    enrollment_files = {
+        "Evan": test_files["evan_enrollment"],
+        "Katelyn": test_files["katelyn_enrollment"]
+    }
+
+    enrollment = create_speaker_enrollment(speaker_model, enrollment_files)
+
+    if not enrollment:
+        print("ERROR: No speakers enrolled successfully")
+        return
+
+    # Extract embeddings for diarized tracks
+    track_embeddings = extract_segments_embeddings(speaker_model, conversation_file, segments)
+
+    if not track_embeddings:
+        print("ERROR: No track embeddings extracted")
+        return
+
+    # Map speaker tracks to enrolled identities
+    speaker_mapping = map_speakers_to_enrollment(track_embeddings, enrollment, similarity_threshold=0.3)
+
+    # Generate final labeled segments
+    labeled_segments = generate_labeled_segments(segments, speaker_mapping)
+
+    # Display results
+    print("\n" + "="*60)
+    print("FINAL RESULTS WITH SPEAKER LABELS")
+    print("="*60)
+
+    print(f"\nLabeled segments ({len(labeled_segments)} total):")
+    for i, seg in enumerate(labeled_segments):
+        print(f" {i+1:2d}: {seg['start']:6.2f}-{seg['end']:6.2f}s | {seg['speaker']:12s} | {seg['duration']:.2f}s")
+
+    # Summary by speaker
+    print("\nSpeaker summary:")
+    speaker_stats = {}
+    for seg in labeled_segments:
+        speaker = seg['speaker']
+        speaker_stats.setdefault(speaker, {'count': 0, 'total_duration': 0.0})
+        speaker_stats[speaker]['count'] += 1
+        speaker_stats[speaker]['total_duration'] += seg['duration']
+
+    for speaker, stats in speaker_stats.items():
+        print(f" {speaker:12s}: {stats['count']:2d} segments, {stats['total_duration']:6.1f}s total")
+
+if __name__ == "__main__":
+    print("SortFormer Diarization + Speaker Enrollment Test Script")
+    print("=" * 60)
+    test_sortformer_with_enrollment()
+    print("\nTest completed!")
\ No newline at end of file
From 32cd0dfb69ac5eb576a966228b38b4f587003906 Mon Sep 17 00:00:00 2001 From: Ankush Malaker <43288948+AnkushMalaker@users.noreply.github.com> Date: Fri, 19 Sep 2025 21:24:03 +0000 Subject: [PATCH 3/7] fix unused fns and vars --- .../advanced/webui/src/pages/Processes.tsx | 22 +------------------ backends/advanced/webui/src/pages/System.tsx | 3 --- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/backends/advanced/webui/src/pages/Processes.tsx
b/backends/advanced/webui/src/pages/Processes.tsx index 0eaf050f..67a9733c 100644 --- a/backends/advanced/webui/src/pages/Processes.tsx +++ b/backends/advanced/webui/src/pages/Processes.tsx @@ -1,5 +1,5 @@ import { useState, useEffect } from 'react' -import { Activity, RefreshCw, Users, Clock, BarChart3 } from 'lucide-react' +import { Activity, RefreshCw } from 'lucide-react' import { systemApi } from '../services/api' import { useAuth } from '../contexts/AuthContext' import ProcessPipelineView from '../components/processes/ProcessPipelineView' @@ -45,26 +45,6 @@ interface ProcessingHistoryItem { error?: string } -interface ClientProcessingDetail { - client_id: string - client_info: { - user_id: string - user_email: string - current_audio_uuid?: string - conversation_start_time?: string - sample_rate?: number - } - processing_status: any - active_tasks: Array<{ - task_id: string - task_name: string - task_type: string - created_at: string - completed_at?: string - error?: string - cancelled: boolean - }> -} export default function Processes() { const [overviewData, setOverviewData] = useState(null) diff --git a/backends/advanced/webui/src/pages/System.tsx b/backends/advanced/webui/src/pages/System.tsx index c1283660..9c1b34eb 100644 --- a/backends/advanced/webui/src/pages/System.tsx +++ b/backends/advanced/webui/src/pages/System.tsx @@ -144,9 +144,6 @@ export default function System() { return displayNames[service] || service.replace('_', ' ').toUpperCase() } - const formatDate = (dateString: string) => { - return new Date(dateString).toLocaleString() - } if (!isAdmin) { return ( From 9903b88e9ae107932778a93e8ee4307f4eec913b Mon Sep 17 00:00:00 2001 From: Ankush Malaker <43288948+AnkushMalaker@users.noreply.github.com> Date: Fri, 19 Sep 2025 21:34:57 +0000 Subject: [PATCH 4/7] tiny fix --- backends/advanced/webui/src/pages/Upload.tsx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/backends/advanced/webui/src/pages/Upload.tsx 
b/backends/advanced/webui/src/pages/Upload.tsx index b77005b4..6fc487e1 100644 --- a/backends/advanced/webui/src/pages/Upload.tsx +++ b/backends/advanced/webui/src/pages/Upload.tsx @@ -61,7 +61,6 @@ export default function Upload() { const [uploadProgress, setUploadProgress] = useState(0) const [processingPhase, setProcessingPhase] = useState<'idle' | 'starting' | 'active' | 'completed'>('idle') const [jobStatus, setJobStatus] = useState(null) - const [processingTasks, setProcessingTasks] = useState([]) // Polling configuration const [autoRefresh, setAutoRefresh] = useState(true) @@ -158,7 +157,7 @@ export default function Upload() { // Filter for upload clients (identified by client_id pattern ending with 3-digit numbers like "-001", "-002") const uploadTasks: ProcessingTask[] = Object.entries(allTasks) - .filter(([clientId, taskData]) => { + .filter(([clientId]) => { // Upload clients have pattern like: "abc123-upload-001", "abc123-upload-002" return /.*-upload-\d{3}$/.test(clientId) }) @@ -170,7 +169,6 @@ export default function Upload() { })) .filter(task => Object.keys(task.stages).length > 0) // Only show clients with active processing - setProcessingTasks(uploadTasks) // Check if all clients are complete OR no upload tasks exist (meaning processing finished) const allComplete = uploadTasks.length > 0 && uploadTasks.every(task => task.status === 'complete') From 38c7c36ffe281eec1f42770a925d523085540544 Mon Sep 17 00:00:00 2001 From: Ankush Malaker <43288948+AnkushMalaker@users.noreply.github.com> Date: Fri, 19 Sep 2025 21:38:24 +0000 Subject: [PATCH 5/7] reduce claude code context --- CLAUDE.md | 912 ++--------------------------- docs/api-reference.md | 151 +++++ docs/distributed-deployment.md | 178 ++++++ {Docs => docs}/features.md | 0 {Docs => docs}/init-system.md | 0 docs/memory-providers.md | 182 ++++++ {Docs => docs}/ports-and-access.md | 0 docs/speaker-recognition.md | 73 +++ docs/versioned-processing.md | 166 ++++++ docs/wyoming-protocol.md | 79 +++ 10 
files changed, 862 insertions(+), 879 deletions(-) create mode 100644 docs/api-reference.md create mode 100644 docs/distributed-deployment.md rename {Docs => docs}/features.md (100%) rename {Docs => docs}/init-system.md (100%) create mode 100644 docs/memory-providers.md rename {Docs => docs}/ports-and-access.md (100%) create mode 100644 docs/speaker-recognition.md create mode 100644 docs/versioned-processing.md create mode 100644 docs/wyoming-protocol.md diff --git a/CLAUDE.md b/CLAUDE.md index aadafd4c..93cf1e55 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -Friend-Lite is at the core an AI-powered personal system - various devices, incuding but not limited to wearables from OMI can be used for at the very least audio capture, speaker specific transcription, memory extraction and retriaval. +Friend-Lite is at the core an AI-powered personal system - various devices, including but not limited to wearables from OMI can be used for at the very least audio capture, speaker specific transcription, memory extraction and retrieval. On top of that - it is being designed to support other services, that can help a user with these inputs such as reminders, action items, personal diagnosis etc. This supports a comprehensive web dashboard for management. 
@@ -76,17 +76,6 @@ source .env && export DEEPGRAM_API_KEY && export OPENAI_API_KEY uv run pytest tests/test_integration.py::test_full_pipeline_integration -v -s ``` -#### Speaker Recognition Tests -```bash -cd extras/speaker-recognition - -# Requires .env file with HF_TOKEN and DEEPGRAM_API_KEY -cp .env.template .env # Configure tokens - -# Run speaker recognition test suite -./run-test.sh -``` - #### Test Script Features - **Environment Compatibility**: Works with both local .env files and CI environment variables - **Simplified Configuration**: Uses environment variables directly, no temporary .env.test files @@ -96,15 +85,6 @@ cp .env.template .env # Configure tokens - **Timeout Protection**: 15-minute timeout for advanced backend, 30-minute for speaker recognition - **Fresh Testing**: Uses CACHED_MODE=False for clean test environments -#### Debugging Integration Tests -For advanced debugging, you can still use the cached mode approach: - -1. **Edit tests/test_integration.py**: Set CACHED_MODE = True -2. **Run test manually**: `uv run pytest tests/test_integration.py -v -s --tb=short` -3. **Debug containers**: `docker logs advanced-backend-friend-backend-test-1 --tail=100` -4. **Test endpoints**: `curl -X GET http://localhost:8001/health` -5. 
**Clean up**: `docker compose -f docker-compose-test.yml down -v` - ### Mobile App Development ```bash cd app @@ -196,105 +176,6 @@ Optional: - Efficient storage utilization for speech-only content - Automatic quality filtering without manual intervention -### Versioned Transcript and Memory System - -**Version Architecture**: -- **`transcript_versions`**: Array of transcript processing attempts with timestamps and providers -- **`memory_versions`**: Array of memory extraction attempts with different models/prompts -- **`active_transcript_version`**: Pointer to currently displayed transcript -- **`active_memory_version`**: Pointer to currently active memory extraction - -**Reprocessing Capabilities**: -- **Transcript Reprocessing**: Re-run speech-to-text with different providers or settings -- **Memory Reprocessing**: Re-extract memories using different LLM models or prompts -- **Version Management**: Switch between different processing results -- **Backward Compatibility**: Legacy fields auto-populated from active versions - -**Data Consistency**: -- All reprocessing operations use `conversation_id` (not `audio_uuid`) -- DateTime objects stored as ISO strings for MongoDB/JSON compatibility -- Legacy field support ensures existing integrations continue working - -### Database Schema Details - -**Collections Overview**: -- **`audio_chunks`**: All audio sessions by `audio_uuid` (always created) -- **`conversations`**: Speech-detected conversations by `conversation_id` (created conditionally) -- **`users`**: User accounts and authentication data - -**Speech-Driven Schema**: -```javascript -// audio_chunks collection (always created) -{ - "_id": ObjectId, - "audio_uuid": "uuid", // Primary identifier - "user_id": ObjectId, - "client_id": "user_suffix-device_name", - "audio_file_path": "/path/to/audio.wav", - "created_at": ISODate, - "transcript": "fallback transcript", // For non-speech audio - "segments": [...], // Speaker segments - "has_speech": boolean, // Speech 
detection result - "speech_analysis": {...}, // Detection metadata - "conversation_id": "conv_id" | null // Link to conversations collection -} - -// conversations collection (speech-detected only) -{ - "_id": ObjectId, - "conversation_id": "conv_uuid", // Primary identifier for user-facing operations - "audio_uuid": "audio_uuid", // Link to audio_chunks - "user_id": ObjectId, - "client_id": "user_suffix-device_name", - "created_at": ISODate, - - // Versioned Transcript System - "transcript_versions": [ - { - "version_id": "uuid", - "transcript": "text content", - "segments": [...], // Speaker diarization - "provider": "deepgram|mistral|parakeet", - "model": "nova-3|voxtral-mini-2507", - "created_at": ISODate, - "processing_time_seconds": 12.5, - "metadata": {...} - } - ], - "active_transcript_version": "uuid", // Points to current version - - // Versioned Memory System - "memory_versions": [ - { - "version_id": "uuid", - "memory_count": 5, - "transcript_version_id": "uuid", // Which transcript was used - "provider": "friend_lite|openmemory_mcp", - "model": "gpt-4o-mini|ollama-llama3", - "created_at": ISODate, - "processing_time_seconds": 45.2, - "metadata": {...} - } - ], - "active_memory_version": "uuid", // Points to current version - - // Legacy Fields (auto-populated from active versions) - "transcript": "text", // From active_transcript_version - "segments": [...], // From active_transcript_version - "memories": [...], // From active_memory_version - "memory_count": 5 // From active_memory_version -} -``` - -**Key Architecture Benefits**: -- **Clean Separation**: Raw audio storage vs user-facing conversations -- **Speech Filtering**: Only meaningful conversations appear in UI -- **Version History**: Complete audit trail of processing attempts -- **Backward Compatibility**: Legacy fields ensure existing code works -- **Reprocessing Support**: Easy to re-run with different providers/models -- **Service Decoupling**: Conversation creation independent of memory 
processing -- **Error Isolation**: Memory service failures don't affect conversation storage - ## Authentication & Security - **User System**: Email-based authentication with MongoDB ObjectId user IDs @@ -323,7 +204,7 @@ DEEPGRAM_API_KEY=your-deepgram-key-here # Optional: PARAKEET_ASR_URL=http://host.docker.internal:8767 # Optional: TRANSCRIPTION_PROVIDER=deepgram -# Memory Provider (New) +# Memory Provider MEMORY_PROVIDER=friend_lite # or openmemory_mcp # Database @@ -340,7 +221,7 @@ CORS_ORIGINS=http://localhost:3000,http://localhost:5173 ### Memory Provider Configuration -Friend-Lite now supports two pluggable memory backends: +Friend-Lite supports two pluggable memory backends: #### Friend-Lite Memory Provider (Default) ```bash @@ -371,81 +252,6 @@ OPENMEMORY_TIMEOUT=30 OPENAI_API_KEY=your-openai-key-here ``` -#### OpenMemory MCP Interface Patterns - -**Important**: OpenMemory MCP stores memories **per-app**, not globally. Understanding this architecture is critical for proper integration. - -**App-Based Storage Architecture:** -- All memories are stored under specific "apps" (namespaces) -- Generic endpoints (`/api/v1/memories/`) return empty results -- App-specific endpoints (`/api/v1/apps/{app_id}/memories`) contain the actual memories - -**Hardcoded Values and Configuration:** -```bash -# Default app name (configurable via OPENMEMORY_CLIENT_NAME) -Default: "friend_lite" - -# Hardcoded metadata (NOT configurable) -"source": "friend_lite" # Always hardcoded in Friend-Lite - -# User ID for OpenMemory MCP server -OPENMEMORY_USER_ID=openmemory # Configurable -``` - -**API Interface Pattern:** -```python -# 1. App Discovery - Find app by client_name -GET /api/v1/apps/ -# Response: {"apps": [{"id": "uuid", "name": "friend_lite", ...}]} - -# 2. 
Memory Creation - Uses generic endpoint but assigns to app -POST /api/v1/memories/ -{ - "user_id": "openmemory", - "text": "memory content", - "app": "friend_lite", # Uses OPENMEMORY_CLIENT_NAME - "metadata": { - "source": "friend_lite", # Hardcoded - "client": "friend_lite" # Uses OPENMEMORY_CLIENT_NAME - } -} - -# 3. Memory Retrieval - Must use app-specific endpoint -GET /api/v1/apps/{app_id}/memories?user_id=openmemory&page=1&size=10 - -# 4. Memory Search - Must use app-specific endpoint with search_query -GET /api/v1/apps/{app_id}/memories?user_id=openmemory&search_query=keyword&page=1&size=10 -``` - -**Friend-Lite Integration Flow:** -1. **App Discovery**: Query `/api/v1/apps/` to find app matching `OPENMEMORY_CLIENT_NAME` -2. **Fallback**: If client app not found, use first available app -3. **Operations**: All memory operations use the app-specific endpoints with discovered `app_id` - -**Testing OpenMemory MCP Integration:** -```bash -# Configure .env file with OpenMemory MCP settings -cp .env.template .env -# Edit .env to set MEMORY_PROVIDER=openmemory_mcp and configure OPENMEMORY_* variables - -# Start OpenMemory MCP server -cd extras/openmemory-mcp && docker compose up -d - -# Run integration tests (reads configuration from .env file) -cd backends/advanced && ./run-test.sh - -# Manual testing - Check app structure -curl -s "http://localhost:8765/api/v1/apps/" | jq - -# Test memory creation -curl -X POST "http://localhost:8765/api/v1/memories/" \ - -H "Content-Type: application/json" \ - -d '{"user_id": "openmemory", "text": "test memory", "app": "friend_lite"}' - -# Retrieve memories (replace app_id with actual ID from apps endpoint) -curl -s "http://localhost:8765/api/v1/apps/{app_id}/memories?user_id=openmemory" | jq -``` - ### Transcription Provider Configuration Friend-Lite supports multiple transcription services: @@ -473,295 +279,40 @@ OLLAMA_BASE_URL=http://ollama:11434 SPEAKER_SERVICE_URL=http://speaker-recognition:8085 ``` -## Transcription 
Architecture - -### Provider System -Friend-Lite supports multiple transcription providers: - -**Online Providers (API-based):** -- **Deepgram**: High-quality transcription using Nova-3 model with real-time streaming -- **Mistral**: Voxtral models for transcription with REST API processing +## Quick API Reference -**Offline Providers (Local processing):** -- **Parakeet**: Local speech recognition service available in extras/asr-services - -**Provider Interface:** -The transcription system handles: -- Connection management and health checks -- Audio format handling (streaming vs batch) -- Error handling and reconnection -- Unified transcript format normalization - -## Wyoming Protocol Implementation - -### Overview -The system uses Wyoming protocol for WebSocket communication between mobile apps and backends. Wyoming is a peer-to-peer protocol for voice assistants that combines JSONL headers with binary audio payloads. - -### Protocol Format -``` -{JSON_HEADER}\n - -``` - -### Supported Events - -#### Audio Session Events -- **audio-start**: Signals the beginning of an audio recording session - ```json - {"type": "audio-start", "data": {"rate": 16000, "width": 2, "channels": 1}, "payload_length": null} - ``` - -- **audio-chunk**: Contains raw audio data with format metadata - ```json - {"type": "audio-chunk", "data": {"rate": 16000, "width": 2, "channels": 1}, "payload_length": 320} - <320 bytes of PCM/Opus audio data> - ``` - -- **audio-stop**: Signals the end of an audio recording session - ```json - {"type": "audio-stop", "data": {"timestamp": 1234567890}, "payload_length": null} - ``` - -### Backend Implementation - -#### Advanced Backend (`/ws_pcm`) -- **Full Wyoming Protocol Support**: Parses all Wyoming events for session management -- **Session Tracking**: Only processes audio chunks when session is active (after audio-start) -- **Conversation Boundaries**: Uses audio-start/stop events to define conversation segments -- **Backward Compatibility**: Fallback 
to raw binary audio for older clients - -#### Simple Backend (`/ws`) -- **Minimal Wyoming Support**: Parses audio-chunk events, ignores others -- **Opus Processing**: Handles Opus-encoded audio chunks from Wyoming protocol -- **Graceful Degradation**: Falls back to raw Opus packets for compatibility - -### Mobile App Integration - -Mobile apps should implement Wyoming protocol for proper session management: - -```javascript -// Start audio session -const audioStart = { - type: "audio-start", - data: { rate: 16000, width: 2, channels: 1 }, - payload_length: null -}; -websocket.send(JSON.stringify(audioStart) + '\n'); - -// Send audio chunks -const audioChunk = { - type: "audio-chunk", - data: { rate: 16000, width: 2, channels: 1 }, - payload_length: audioData.byteLength -}; -websocket.send(JSON.stringify(audioChunk) + '\n'); -websocket.send(audioData); - -// End audio session -const audioStop = { - type: "audio-stop", - data: { timestamp: Date.now() }, - payload_length: null -}; -websocket.send(JSON.stringify(audioStop) + '\n'); -``` - -### Benefits -- **Clear Session Boundaries**: No timeout-based conversation detection needed -- **Structured Communication**: Consistent protocol across all audio streaming -- **Future Extensibility**: Room for additional event types (pause, resume, metadata) -- **Backward Compatibility**: Works with existing raw audio streaming clients - -## Memory System Architecture - -### Overview -Friend-Lite supports two pluggable memory backends that can be selected via configuration: - -#### 1. 
Friend-Lite Memory Provider (`friend_lite`) -The sophisticated in-house memory implementation with full control and customization: - -**Features:** -- Custom LLM-powered memory extraction with enhanced prompts -- Individual fact storage (no JSON blobs) -- Smart deduplication algorithms -- Intelligent memory updates (ADD/UPDATE/DELETE decisions) -- **Semantic search** with relevance threshold filtering -- **Memory count API** with total count tracking from native Qdrant -- Direct Qdrant vector storage with accurate similarity scoring -- Custom memory prompts and processing -- No external dependencies - -**Architecture Flow:** -1. **Audio Input** → Transcription via Deepgram/Parakeet -2. **Memory Extraction** → LLM processes transcript using custom prompts -3. **Fact Parsing** → XML/JSON parsing into individual memory entries -4. **Deduplication** → Smart algorithms prevent duplicate memories -5. **Vector Storage** → Direct Qdrant storage with embeddings -6. **Memory Updates** → LLM-driven action proposals (ADD/UPDATE/DELETE) - -#### 2. OpenMemory MCP Provider (`openmemory_mcp`) -Thin client that delegates all memory processing to external OpenMemory MCP server: - -**Features:** -- Professional memory extraction (handled by OpenMemory) -- Battle-tested deduplication (handled by OpenMemory) -- Semantic vector search (handled by OpenMemory) -- ACL-based user isolation (handled by OpenMemory) -- Cross-client compatibility (Claude Desktop, Cursor, Windsurf) -- Web UI for memory management at http://localhost:8765 - -**Architecture Flow:** -1. **Audio Input** → Transcription via Deepgram/Parakeet -2. **MCP Delegation** → Send enriched transcript to OpenMemory MCP server -3. **External Processing** → OpenMemory handles extraction, deduplication, storage -4. **Result Mapping** → Convert MCP results to Friend-Lite MemoryEntry format -5. 
**Client Management** → Automatic user context switching via MCP client - -### Memory Provider Comparison - -| Feature | Friend-Lite | OpenMemory MCP | -|---------|-------------|----------------| -| **Processing** | Custom LLM extraction | Delegates to OpenMemory | -| **Deduplication** | Custom algorithms | OpenMemory handles | -| **Vector Storage** | Direct Qdrant | OpenMemory handles | -| **Search Features** | Semantic search with threshold filtering | Semantic search with relevance scoring | -| **Memory Count** | Native Qdrant count API | Varies by OpenMemory support | -| **Dependencies** | Qdrant + MongoDB | External OpenMemory server | -| **Customization** | Full control | Limited to OpenMemory features | -| **Cross-client** | Friend-Lite only | Works with Claude Desktop, Cursor, etc | -| **Web UI** | Friend-Lite WebUI with advanced search | OpenMemory UI + Friend-Lite WebUI | -| **Memory Format** | Individual facts | OpenMemory format | -| **Setup Complexity** | Medium | High (external server required) | - -### Switching Memory Providers - -You can switch providers by changing the `MEMORY_PROVIDER` environment variable: +### Common Endpoints +- **GET /health**: Basic application health check +- **GET /readiness**: Service dependency validation +- **WS /ws_pcm**: Primary audio streaming endpoint (Wyoming protocol + raw PCM fallback) +- **GET /api/conversations**: User's conversations with transcripts +- **GET /api/memories/search**: Semantic memory search with relevance scoring +- **POST /auth/jwt/login**: Email-based login (returns JWT token) +### Authentication Flow ```bash -# Switch to OpenMemory MCP -echo "MEMORY_PROVIDER=openmemory_mcp" >> .env +# 1. Get auth token +curl -s -X POST \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "username=admin@example.com&password=your-password-here" \ + http://localhost:8000/auth/jwt/login -# Switch back to Friend-Lite -echo "MEMORY_PROVIDER=friend_lite" >> .env +# 2. 
Use token in API calls +curl -s -H "Authorization: Bearer YOUR_TOKEN" \ + http://localhost:8000/api/conversations ``` -**Note:** Existing memories are not automatically migrated between providers. Each provider maintains its own memory storage. - -### OpenMemory MCP Setup - -To use the OpenMemory MCP provider: - +### Development Reset Commands ```bash -# 1. Start external OpenMemory MCP server -cd extras/openmemory-mcp -docker compose up -d - -# 2. Configure Friend-Lite to use OpenMemory MCP +# Reset all data (development only) cd backends/advanced -echo "MEMORY_PROVIDER=openmemory_mcp" >> .env +sudo rm -rf data/ -# 3. Start Friend-Lite backend +# Reset Docker volumes +docker compose down -v docker compose up --build -d ``` -### When to Use Each Provider - -**Use Friend-Lite when:** -- You want full control over memory processing -- You need custom memory extraction logic -- You prefer fewer external dependencies -- You want to customize memory prompts and algorithms -- You need individual fact-based memory storage - -**Use OpenMemory MCP when:** -- You want professional, battle-tested memory processing -- You need cross-client compatibility (Claude Desktop, Cursor, etc.) -- You prefer to leverage external expertise rather than maintain custom logic -- You want access to OpenMemory's web interface -- You're already using OpenMemory in other tools - -## Versioned Processing System - -### Overview - -Friend-Lite implements a comprehensive versioning system for both transcript and memory processing, allowing multiple processing attempts with different providers, models, or settings while maintaining a clean user experience. 
- -### Version Data Structure - -**Transcript Versions**: -```json -{ - "transcript_versions": [ - { - "version_id": "uuid", - "transcript": "processed text", - "segments": [...], - "provider": "deepgram|mistral|parakeet", - "model": "nova-3|voxtral-mini-2507", - "created_at": "2025-01-15T10:30:00Z", - "processing_time_seconds": 12.5, - "metadata": { - "confidence_scores": [...], - "speaker_diarization": true - } - } - ], - "active_transcript_version": "uuid" -} -``` - -**Memory Versions**: -```json -{ - "memory_versions": [ - { - "version_id": "uuid", - "memory_count": 5, - "transcript_version_id": "uuid", - "provider": "friend_lite|openmemory_mcp", - "model": "gpt-4o-mini|ollama-llama3", - "created_at": "2025-01-15T10:32:00Z", - "processing_time_seconds": 45.2, - "metadata": { - "prompt_version": "v2.1", - "extraction_quality": "high" - } - } - ], - "active_memory_version": "uuid" -} -``` - -### Reprocessing Workflows - -**Transcript Reprocessing**: -1. Trigger via API: `POST /api/conversations/{conversation_id}/reprocess-transcript` -2. System creates new transcript version with different provider/model -3. New version added to `transcript_versions` array -4. User can activate any version via `activate-transcript` endpoint -5. Legacy `transcript` field automatically updated from active version - -**Memory Reprocessing**: -1. Trigger via API: `POST /api/conversations/{conversation_id}/reprocess-memory` -2. Specify which transcript version to use as input -3. System creates new memory version using specified transcript -4. New version added to `memory_versions` array -5. User can activate any version via `activate-memory` endpoint -6. 
Legacy `memories` field automatically updated from active version - -### Legacy Field Compatibility - -**Automatic Population**: -- `transcript`: Auto-populated from active transcript version -- `segments`: Auto-populated from active transcript version -- `memories`: Auto-populated from active memory version -- `memory_count`: Auto-populated from active memory version - -**Backward Compatibility**: -- Existing API clients continue working without modification -- WebUI displays active versions by default -- Advanced users can access version history and switch between versions - ## Development Notes ### Package Management @@ -815,412 +366,15 @@ The system includes comprehensive health checks: ### Cursor Rule Integration Project includes `.cursor/rules/always-plan-first.mdc` requiring understanding before coding. Always explain the task and confirm approach before implementation. +## Extended Documentation -## API Reference - -### Health & Status Endpoints -- **GET /health**: Basic application health check -- **GET /readiness**: Service dependency validation (MongoDB, Qdrant, etc.) 
-- **GET /api/metrics**: System metrics and debug tracker status (Admin only) -- **GET /api/processor/status**: Processor queue status and health (Admin only) -- **GET /api/processor/tasks**: All active processing tasks (Admin only) -- **GET /api/processor/tasks/{client_id}**: Processing task status for specific client (Admin only) - -### WebSocket Endpoints -- **WS /ws_pcm**: Primary audio streaming endpoint (Wyoming protocol + raw PCM fallback) -- **WS /ws**: Simple audio streaming endpoint (Opus packets + Wyoming audio-chunk events) - -### Memory & Conversation Debugging -- **GET /api/admin/memories**: All memories across all users with debug stats (Admin only) -- **GET /api/memories/unfiltered**: User's memories without filtering -- **GET /api/memories/search**: Semantic memory search with relevance scoring -- **GET /api/conversations**: User's conversations with transcripts -- **GET /api/conversations/{conversation_id}**: Specific conversation details -- **POST /api/conversations/{conversation_id}/reprocess-transcript**: Re-run transcript processing -- **POST /api/conversations/{conversation_id}/reprocess-memory**: Re-extract memories with different parameters -- **GET /api/conversations/{conversation_id}/versions**: Get all transcript and memory versions -- **POST /api/conversations/{conversation_id}/activate-transcript**: Switch to a different transcript version -- **POST /api/conversations/{conversation_id}/activate-memory**: Switch to a different memory version - -### Client Management -- **GET /api/clients/active**: Currently active WebSocket clients -- **GET /api/users**: List all users (Admin only) - -### File Processing -- **POST /api/process-audio-files**: Upload and process audio files (Admin only) - - Note: Processes files sequentially, may timeout for large files - - Client timeout: 5 minutes, Server processing: up to 3x audio duration + 60s - - Example usage: - ```bash - # Step 1: Read .env file for ADMIN_EMAIL and ADMIN_PASSWORD - # Step 2: Get 
auth token - # Step 3: Use token in file upload - curl -X POST \ - -H "Authorization: Bearer YOUR_TOKEN_HERE" \ - -F "files=@/path/to/audio.wav" \ - -F "device_name=test-upload" \ - http://localhost:8000/api/process-audio-files - ``` - -### Authentication -- **POST /auth/jwt/login**: Email-based login (returns JWT token) -- **GET /users/me**: Get current authenticated user -- **GET /api/auth/config**: Authentication configuration - -### Step-by-Step API Testing Guide - -When testing API endpoints that require authentication, follow these steps: - -#### Step 1: Read credentials from .env file -```bash -# Use the Read tool to view the .env file and identify credentials -# Look for: -# ADMIN_EMAIL=admin@example.com -# ADMIN_PASSWORD=your-password-here -``` - -#### Step 2: Get authentication token -```bash -curl -s -X POST \ - -H "Content-Type: application/x-www-form-urlencoded" \ - -d "username=admin@example.com&password=your-password-here" \ - http://localhost:8000/auth/jwt/login -``` -This returns: -```json -{"access_token":"eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...","token_type":"bearer"} -``` - -#### Step 3: Use the token in API calls -```bash -# Extract the token from the response above and use it: -curl -s -H "Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9..." \ - http://localhost:8000/api/conversations - -# For reprocessing endpoints: -curl -s -X POST \ - -H "Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9..." \ - -H "Content-Type: application/json" \ - http://localhost:8000/api/conversations/{conversation_id}/reprocess-transcript -``` - -**Important**: Always read the .env file first using the Read tool rather than using shell commands like `grep` or `cut`. This ensures you see the exact values and can copy them accurately. 
- -#### Step 4: Testing Reprocessing Endpoints -Once you have the auth token, you can test the reprocessing functionality: - -```bash -# Get list of conversations to find a conversation_id -curl -s -H "Authorization: Bearer YOUR_TOKEN" \ - http://localhost:8000/api/conversations - -# Test transcript reprocessing (uses conversation_id) -curl -s -X POST \ - -H "Authorization: Bearer YOUR_TOKEN" \ - -H "Content-Type: application/json" \ - http://localhost:8000/api/conversations/YOUR_CONVERSATION_ID/reprocess-transcript - -# Test memory reprocessing (uses conversation_id and transcript_version_id) -curl -s -X POST \ - -H "Authorization: Bearer YOUR_TOKEN" \ - -H "Content-Type: application/json" \ - -d '{"transcript_version_id": "VERSION_ID"}' \ - http://localhost:8000/api/conversations/YOUR_CONVERSATION_ID/reprocess-memory - -# Get transcript and memory versions -curl -s -H "Authorization: Bearer YOUR_TOKEN" \ - http://localhost:8000/api/conversations/YOUR_CONVERSATION_ID/versions - -# Activate a specific transcript version -curl -s -X POST \ - -H "Authorization: Bearer YOUR_TOKEN" \ - -H "Content-Type: application/json" \ - -d '{"transcript_version_id": "VERSION_ID"}' \ - http://localhost:8000/api/conversations/YOUR_CONVERSATION_ID/activate-transcript - -# Activate a specific memory version -curl -s -X POST \ - -H "Authorization: Bearer YOUR_TOKEN" \ - -H "Content-Type: application/json" \ - -d '{"memory_version_id": "VERSION_ID"}' \ - http://localhost:8000/api/conversations/YOUR_CONVERSATION_ID/activate-memory -``` - -### Development Reset Endpoints -Useful endpoints for resetting state during development: - -#### Data Cleanup -- **DELETE /api/admin/memory/delete-all**: Delete all memories for the current user -- **DELETE /api/memories/{memory_id}**: Delete a specific memory -- **DELETE /api/conversations/{conversation_id}**: Delete a specific conversation (keeps original audio file in audio_chunks) -- **DELETE /api/chat/sessions/{session_id}**: Delete a chat session 
and all its messages -- **DELETE /api/users/{user_id}**: Delete a user (Admin only) - - Optional query params: `delete_conversations=true`, `delete_memories=true` - -#### Quick Reset Commands -```bash -# Reset all data (development only) -cd backends/advanced -sudo rm -rf data/ - -# Reset Docker volumes -docker compose down -v -docker compose up --build -d -``` - - -## Speaker Recognition Service Features - -### Speaker Analysis & Visualization -The speaker recognition service now includes advanced analysis capabilities: - -#### Embedding Analysis (/speakers/analysis endpoint) -- **2D/3D Visualization**: Interactive embedding plots using UMAP, t-SNE, or PCA -- **Clustering Analysis**: Automatic clustering using DBSCAN or K-means -- **Speaker Similarity Detection**: Identifies speakers with similar embeddings -- **Quality Metrics**: Embedding separation quality and confidence scores -- **Interactive Controls**: Adjustable analysis parameters and visualization options - -Access via: `extras/speaker-recognition/webui` → Speakers → Embedding Analysis tab - -#### Live Inference Feature (/infer-live page) -Real-time speaker identification and transcription: -- **WebRTC Audio Capture**: Live microphone access with waveform visualization -- **Deepgram Streaming**: Real-time transcription with speaker diarization -- **Live Speaker ID**: Identifies enrolled speakers in real-time using internal service -- **Session Statistics**: Live metrics for words, speakers, and confidence scores -- **Configurable Settings**: Adjustable confidence thresholds and audio parameters - -Access via: `extras/speaker-recognition/webui` → Live Inference - -### Technical Implementation - -#### Backend (Python) -- **Analysis Utils**: `src/simple_speaker_recognition/utils/analysis.py` - - UMAP/t-SNE dimensionality reduction - - DBSCAN/K-means clustering - - Cosine similarity analysis - - Quality metrics calculation -- **API Endpoint**: `/speakers/analysis` - Returns processed embedding analysis -- 
**Dependencies**: Added `umap-learn` for dimensionality reduction - -#### Frontend (React/TypeScript) -- **EmbeddingPlot Component**: Interactive Plotly.js visualizations -- **LiveAudioCapture Component**: WebRTC audio recording with waveform -- **DeepgramStreaming Service**: WebSocket integration for real-time transcription -- **InferLive Page**: Complete live inference interface - -### Usage Instructions - -#### Setting up Live Inference -1. Navigate to Live Inference page -2. Configure Deepgram API key in settings -3. Adjust speaker identification settings (confidence threshold) -4. Start live session to begin real-time transcription and speaker ID - -**Technical Details:** -- **Audio Processing**: Uses browser's native sample rate (typically 44.1kHz or 48kHz) -- **Buffer Retention**: 120 seconds of audio for improved utterance capture -- **Real-time Updates**: Live transcription with speaker identification results - -#### Using Speaker Analysis -1. Go to Speakers page → Embedding Analysis tab -2. Select analysis method (UMAP, t-SNE, PCA) -3. Choose clustering algorithm (DBSCAN, K-means) -4. Adjust similarity threshold for speaker detection -5. 
View interactive plots and quality metrics - -### Deployment Notes -- Requires Docker rebuild to pick up new Python dependencies -- Frontend dependencies (Plotly.js) already included -- Live inference requires Deepgram API key for streaming transcription -- Speaker identification uses existing enrolled speakers from database - -### Live Inference Troubleshooting -- **"NaN:NaN" timestamps**: Fixed in recent updates, ensure you're using latest version -- **Poor speaker identification**: Try adjusting confidence threshold or re-enrolling speakers -- **Audio processing delays**: Check browser console for sample rate detection logs -- **Buffer overflow issues**: Extended to 120-second retention for better performance -- **"extraction_failed" errors**: Usually indicates audio buffer timing issues - check console logs for buffer availability - -## Distributed Self-Hosting Architecture - -Friend-Lite supports distributed deployment across multiple machines, allowing you to separate GPU-intensive services from lightweight backend components. This is ideal for scenarios where you have a dedicated GPU machine and want to run the main backend on a VPS or Raspberry Pi. - -### Architecture Patterns - -#### Single Machine (Default) -All services run on one machine using Docker Compose - ideal for development and simple deployments. 
- -#### Distributed GPU Setup -**GPU Machine (High-performance):** -- LLM services (Ollama with GPU acceleration) -- ASR services (Parakeet with GPU) -- Speaker recognition service -- Deepgram fallback can remain on backend machine - -**Backend Machine (Lightweight - VPS/RPi):** -- Friend-Lite backend (FastAPI) -- React WebUI -- MongoDB -- Qdrant vector database - -### Networking with Tailscale - -Tailscale VPN provides secure, encrypted networking between distributed services: - -**Benefits:** -- **Zero configuration networking**: Services discover each other automatically -- **Encrypted communication**: All inter-service traffic is encrypted -- **Firewall friendly**: Works behind NATs and firewalls -- **Access control**: Granular permissions for service access -- **CORS support**: Built-in support for Tailscale IP ranges (100.x.x.x) - -**Installation:** -```bash -# On each machine -curl -fsSL https://tailscale.com/install.sh | sh -sudo tailscale up -``` - -### Distributed Service Configuration - -#### GPU Machine Services -```bash -# .env on GPU machine -OLLAMA_BASE_URL=http://0.0.0.0:11434 # Expose to Tailscale network -SPEAKER_SERVICE_URL=http://0.0.0.0:8085 - -# Enable GPU acceleration for Ollama -docker run -d --gpus=all -p 11434:11434 ollama/ollama:latest -``` - -#### Backend Machine Configuration -```bash -# .env on backend machine -OLLAMA_BASE_URL=http://100.x.x.x:11434 # GPU machine Tailscale IP -SPEAKER_SERVICE_URL=http://100.x.x.x:8085 # GPU machine Tailscale IP - -# Parakeet ASR services can also be distributed (if using offline ASR) -# PARAKEET_ASR_URL=http://100.x.x.x:8767 - -# CORS automatically supports Tailscale IPs (no configuration needed) -``` - -#### Service URL Examples - -**Common remote service configurations:** -```bash -# LLM Processing (GPU machine) -OLLAMA_BASE_URL=http://100.64.1.100:11434 -OPENAI_BASE_URL=http://100.64.1.100:8080 # For vLLM/OpenAI-compatible APIs - -# Speech Recognition (GPU machine) -# 
PARAKEET_ASR_URL=http://100.64.1.100:8767 # If using Parakeet ASR -SPEAKER_SERVICE_URL=http://100.64.1.100:8085 - -# Database services (can be on separate machine) -MONGODB_URI=mongodb://100.64.1.200:27017 # Database name: friend-lite -QDRANT_BASE_URL=http://100.64.1.200:6333 -``` - -### Deployment Steps - -#### 1. Set up Tailscale on all machines -```bash -# Install and connect each machine to your Tailscale network -curl -fsSL https://tailscale.com/install.sh | sh -sudo tailscale up -``` - -#### 2. Deploy GPU services -```bash -# On GPU machine - start GPU-accelerated services -cd extras/asr-services && docker compose up parakeet -d -cd extras/speaker-recognition && docker compose up --build -d - -# Start Ollama with GPU support -docker run -d --gpus=all -p 11434:11434 \ - -v ollama:/root/.ollama \ - ollama/ollama:latest -``` - -#### 3. Configure backend machine -```bash -# Update .env with Tailscale IPs of GPU machine -OLLAMA_BASE_URL=http://[gpu-machine-tailscale-ip]:11434 -SPEAKER_SERVICE_URL=http://[gpu-machine-tailscale-ip]:8085 - -# Start lightweight backend services -docker compose up --build -d -``` - -#### 4. 
Verify connectivity -```bash -# Test service connectivity from backend machine -curl http://[gpu-machine-ip]:11434/api/tags # Ollama -curl http://[gpu-machine-ip]:8085/health # Speaker recognition -``` - -### Performance Considerations - -**Network Latency:** -- Tailscale adds minimal latency (typically <5ms between nodes) -- LLM inference: Network time negligible compared to GPU processing -- ASR streaming: Use local fallback for latency-sensitive applications - -**Bandwidth Usage:** -- Audio streaming: ~128kbps for Opus, ~512kbps for PCM -- LLM requests: Typically <1MB per conversation -- Memory embeddings: ~3KB per memory vector - -**Processing Time Expectations:** -- Transcription (Deepgram): 2-5 seconds for 4-minute audio -- Transcription (Parakeet): 5-10 seconds for 4-minute audio -- Memory extraction (OpenAI GPT-4o-mini): 30-40 seconds for typical conversation -- Memory extraction (Ollama local): 45-90 seconds depending on model and GPU -- Full pipeline (4-min audio): 40-60 seconds with cloud services, 60-120 seconds with local models - -### Security Best Practices - -**Tailscale Access Control:** -```json -{ - "acls": [ - { - "action": "accept", - "src": ["tag:backend"], - "dst": ["tag:gpu:11434", "tag:gpu:8085", "tag:gpu:8767"] - } - ], - "tagOwners": { - "tag:backend": ["your-email@example.com"], - "tag:gpu": ["your-email@example.com"] - } -} -``` - -**Service Isolation:** -- Run GPU services in containers with limited network access -- Use Tailscale subnet routing for additional security -- Monitor service access logs for unauthorized requests - -### Troubleshooting Distributed Setup - -**Debugging Commands:** -```bash -# Check Tailscale connectivity -tailscale ping [machine-name] -tailscale status - -# Test service endpoints -curl http://[tailscale-ip]:11434/api/tags -curl http://[tailscale-ip]:8085/health - -# Check Docker networks -docker network ls -docker ps --format "table {{.Names}}\t{{.Ports}}" -``` +For detailed technical documentation, see: +- 
**[@docs/wyoming-protocol.md](docs/wyoming-protocol.md)**: WebSocket communication protocol details +- **[@docs/memory-providers.md](docs/memory-providers.md)**: In-depth memory provider comparison and setup +- **[@docs/versioned-processing.md](docs/versioned-processing.md)**: Transcript and memory versioning details +- **[@docs/api-reference.md](docs/api-reference.md)**: Complete endpoint documentation with examples +- **[@docs/speaker-recognition.md](docs/speaker-recognition.md)**: Advanced analysis and live inference features +- **[@docs/distributed-deployment.md](docs/distributed-deployment.md)**: Multi-machine deployment with Tailscale ## Notes for Claude Check if the src/ is volume mounted. If not, do compose build so that code changes are reflected. Do not simply run `docker compose restart` as it will not rebuild the image. diff --git a/docs/api-reference.md b/docs/api-reference.md new file mode 100644 index 00000000..e287a2f7 --- /dev/null +++ b/docs/api-reference.md @@ -0,0 +1,151 @@ +# API Reference + +## Health & Status Endpoints +- **GET /health**: Basic application health check +- **GET /readiness**: Service dependency validation (MongoDB, Qdrant, etc.) 
+- **GET /api/metrics**: System metrics and debug tracker status (Admin only) +- **GET /api/processor/status**: Processor queue status and health (Admin only) +- **GET /api/processor/tasks**: All active processing tasks (Admin only) +- **GET /api/processor/tasks/{client_id}**: Processing task status for specific client (Admin only) + +## WebSocket Endpoints +- **WS /ws_pcm**: Primary audio streaming endpoint (Wyoming protocol + raw PCM fallback) +- **WS /ws**: Simple audio streaming endpoint (Opus packets + Wyoming audio-chunk events) + +## Memory & Conversation Debugging +- **GET /api/admin/memories**: All memories across all users with debug stats (Admin only) +- **GET /api/memories/unfiltered**: User's memories without filtering +- **GET /api/memories/search**: Semantic memory search with relevance scoring +- **GET /api/conversations**: User's conversations with transcripts +- **GET /api/conversations/{conversation_id}**: Specific conversation details +- **POST /api/conversations/{conversation_id}/reprocess-transcript**: Re-run transcript processing +- **POST /api/conversations/{conversation_id}/reprocess-memory**: Re-extract memories with different parameters +- **GET /api/conversations/{conversation_id}/versions**: Get all transcript and memory versions +- **POST /api/conversations/{conversation_id}/activate-transcript**: Switch to a different transcript version +- **POST /api/conversations/{conversation_id}/activate-memory**: Switch to a different memory version + +## Client Management +- **GET /api/clients/active**: Currently active WebSocket clients +- **GET /api/users**: List all users (Admin only) + +## File Processing +- **POST /api/process-audio-files**: Upload and process audio files (Admin only) + - Note: Processes files sequentially, may timeout for large files + - Client timeout: 5 minutes, Server processing: up to 3x audio duration + 60s + - Example usage: + ```bash + # Step 1: Read .env file for ADMIN_EMAIL and ADMIN_PASSWORD + # Step 2: Get auth 
token + # Step 3: Use token in file upload + curl -X POST \ + -H "Authorization: Bearer YOUR_TOKEN_HERE" \ + -F "files=@/path/to/audio.wav" \ + -F "device_name=test-upload" \ + http://localhost:8000/api/process-audio-files + ``` + +## Authentication +- **POST /auth/jwt/login**: Email-based login (returns JWT token) +- **GET /users/me**: Get current authenticated user +- **GET /api/auth/config**: Authentication configuration + +## Step-by-Step API Testing Guide + +When testing API endpoints that require authentication, follow these steps: + +### Step 1: Read credentials from .env file +```bash +# Use the Read tool to view the .env file and identify credentials +# Look for: +# ADMIN_EMAIL=admin@example.com +# ADMIN_PASSWORD=your-password-here +``` + +### Step 2: Get authentication token +```bash +curl -s -X POST \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "username=admin@example.com&password=your-password-here" \ + http://localhost:8000/auth/jwt/login +``` +This returns: +```json +{"access_token":"eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...","token_type":"bearer"} +``` + +### Step 3: Use the token in API calls +```bash +# Extract the token from the response above and use it: +curl -s -H "Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9..." \ + http://localhost:8000/api/conversations + +# For reprocessing endpoints: +curl -s -X POST \ + -H "Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9..." \ + -H "Content-Type: application/json" \ + http://localhost:8000/api/conversations/{conversation_id}/reprocess-transcript +``` + +**Important**: Always read the .env file first using the Read tool rather than using shell commands like `grep` or `cut`. This ensures you see the exact values and can copy them accurately. 
+ +### Step 4: Testing Reprocessing Endpoints +Once you have the auth token, you can test the reprocessing functionality: + +```bash +# Get list of conversations to find a conversation_id +curl -s -H "Authorization: Bearer YOUR_TOKEN" \ + http://localhost:8000/api/conversations + +# Test transcript reprocessing (uses conversation_id) +curl -s -X POST \ + -H "Authorization: Bearer YOUR_TOKEN" \ + -H "Content-Type: application/json" \ + http://localhost:8000/api/conversations/YOUR_CONVERSATION_ID/reprocess-transcript + +# Test memory reprocessing (uses conversation_id and transcript_version_id) +curl -s -X POST \ + -H "Authorization: Bearer YOUR_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"transcript_version_id": "VERSION_ID"}' \ + http://localhost:8000/api/conversations/YOUR_CONVERSATION_ID/reprocess-memory + +# Get transcript and memory versions +curl -s -H "Authorization: Bearer YOUR_TOKEN" \ + http://localhost:8000/api/conversations/YOUR_CONVERSATION_ID/versions + +# Activate a specific transcript version +curl -s -X POST \ + -H "Authorization: Bearer YOUR_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"transcript_version_id": "VERSION_ID"}' \ + http://localhost:8000/api/conversations/YOUR_CONVERSATION_ID/activate-transcript + +# Activate a specific memory version +curl -s -X POST \ + -H "Authorization: Bearer YOUR_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"memory_version_id": "VERSION_ID"}' \ + http://localhost:8000/api/conversations/YOUR_CONVERSATION_ID/activate-memory +``` + +## Development Reset Endpoints +Useful endpoints for resetting state during development: + +### Data Cleanup +- **DELETE /api/admin/memory/delete-all**: Delete all memories for the current user +- **DELETE /api/memories/{memory_id}**: Delete a specific memory +- **DELETE /api/conversations/{conversation_id}**: Delete a specific conversation (keeps original audio file in audio_chunks) +- **DELETE /api/chat/sessions/{session_id}**: Delete a chat session 
and all its messages +- **DELETE /api/users/{user_id}**: Delete a user (Admin only) + - Optional query params: `delete_conversations=true`, `delete_memories=true` + +### Quick Reset Commands +```bash +# Reset all data (development only) +cd backends/advanced +sudo rm -rf data/ + +# Reset Docker volumes +docker compose down -v +docker compose up --build -d +``` \ No newline at end of file diff --git a/docs/distributed-deployment.md b/docs/distributed-deployment.md new file mode 100644 index 00000000..e6825bb0 --- /dev/null +++ b/docs/distributed-deployment.md @@ -0,0 +1,178 @@ +# Distributed Self-Hosting Architecture + +Friend-Lite supports distributed deployment across multiple machines, allowing you to separate GPU-intensive services from lightweight backend components. This is ideal for scenarios where you have a dedicated GPU machine and want to run the main backend on a VPS or Raspberry Pi. + +## Architecture Patterns + +### Single Machine (Default) +All services run on one machine using Docker Compose - ideal for development and simple deployments. 
+ +### Distributed GPU Setup +**GPU Machine (High-performance):** +- LLM services (Ollama with GPU acceleration) +- ASR services (Parakeet with GPU) +- Speaker recognition service +- Deepgram fallback can remain on backend machine + +**Backend Machine (Lightweight - VPS/RPi):** +- Friend-Lite backend (FastAPI) +- React WebUI +- MongoDB +- Qdrant vector database + +## Networking with Tailscale + +Tailscale VPN provides secure, encrypted networking between distributed services: + +**Benefits:** +- **Zero configuration networking**: Services discover each other automatically +- **Encrypted communication**: All inter-service traffic is encrypted +- **Firewall friendly**: Works behind NATs and firewalls +- **Access control**: Granular permissions for service access +- **CORS support**: Built-in support for Tailscale IP ranges (100.x.x.x) + +**Installation:** +```bash +# On each machine +curl -fsSL https://tailscale.com/install.sh | sh +sudo tailscale up +``` + +## Distributed Service Configuration + +### GPU Machine Services +```bash +# .env on GPU machine +OLLAMA_BASE_URL=http://0.0.0.0:11434 # Expose to Tailscale network +SPEAKER_SERVICE_URL=http://0.0.0.0:8085 + +# Enable GPU acceleration for Ollama +docker run -d --gpus=all -p 11434:11434 ollama/ollama:latest +``` + +### Backend Machine Configuration +```bash +# .env on backend machine +OLLAMA_BASE_URL=http://100.x.x.x:11434 # GPU machine Tailscale IP +SPEAKER_SERVICE_URL=http://100.x.x.x:8085 # GPU machine Tailscale IP + +# Parakeet ASR services can also be distributed (if using offline ASR) +# PARAKEET_ASR_URL=http://100.x.x.x:8767 + +# CORS automatically supports Tailscale IPs (no configuration needed) +``` + +### Service URL Examples + +**Common remote service configurations:** +```bash +# LLM Processing (GPU machine) +OLLAMA_BASE_URL=http://100.64.1.100:11434 +OPENAI_BASE_URL=http://100.64.1.100:8080 # For vLLM/OpenAI-compatible APIs + +# Speech Recognition (GPU machine) +# 
PARAKEET_ASR_URL=http://100.64.1.100:8767 # If using Parakeet ASR +SPEAKER_SERVICE_URL=http://100.64.1.100:8085 + +# Database services (can be on separate machine) +MONGODB_URI=mongodb://100.64.1.200:27017 # Database name: friend-lite +QDRANT_BASE_URL=http://100.64.1.200:6333 +``` + +## Deployment Steps + +### 1. Set up Tailscale on all machines +```bash +# Install and connect each machine to your Tailscale network +curl -fsSL https://tailscale.com/install.sh | sh +sudo tailscale up +``` + +### 2. Deploy GPU services +```bash +# On GPU machine - start GPU-accelerated services +cd extras/asr-services && docker compose up parakeet -d +cd extras/speaker-recognition && docker compose up --build -d + +# Start Ollama with GPU support +docker run -d --gpus=all -p 11434:11434 \ + -v ollama:/root/.ollama \ + ollama/ollama:latest +``` + +### 3. Configure backend machine +```bash +# Update .env with Tailscale IPs of GPU machine +OLLAMA_BASE_URL=http://[gpu-machine-tailscale-ip]:11434 +SPEAKER_SERVICE_URL=http://[gpu-machine-tailscale-ip]:8085 + +# Start lightweight backend services +docker compose up --build -d +``` + +### 4. 
Verify connectivity +```bash +# Test service connectivity from backend machine +curl http://[gpu-machine-ip]:11434/api/tags # Ollama +curl http://[gpu-machine-ip]:8085/health # Speaker recognition +``` + +## Performance Considerations + +**Network Latency:** +- Tailscale adds minimal latency (typically <5ms between nodes) +- LLM inference: Network time negligible compared to GPU processing +- ASR streaming: Use local fallback for latency-sensitive applications + +**Bandwidth Usage:** +- Audio streaming: ~128kbps for Opus, ~512kbps for PCM +- LLM requests: Typically <1MB per conversation +- Memory embeddings: ~3KB per memory vector + +**Processing Time Expectations:** +- Transcription (Deepgram): 2-5 seconds for 4-minute audio +- Transcription (Parakeet): 5-10 seconds for 4-minute audio +- Memory extraction (OpenAI GPT-4o-mini): 30-40 seconds for typical conversation +- Memory extraction (Ollama local): 45-90 seconds depending on model and GPU +- Full pipeline (4-min audio): 40-60 seconds with cloud services, 60-120 seconds with local models + +## Security Best Practices + +**Tailscale Access Control:** +```json +{ + "acls": [ + { + "action": "accept", + "src": ["tag:backend"], + "dst": ["tag:gpu:11434", "tag:gpu:8085", "tag:gpu:8767"] + } + ], + "tagOwners": { + "tag:backend": ["your-email@example.com"], + "tag:gpu": ["your-email@example.com"] + } +} +``` + +**Service Isolation:** +- Run GPU services in containers with limited network access +- Use Tailscale subnet routing for additional security +- Monitor service access logs for unauthorized requests + +## Troubleshooting Distributed Setup + +**Debugging Commands:** +```bash +# Check Tailscale connectivity +tailscale ping [machine-name] +tailscale status + +# Test service endpoints +curl http://[tailscale-ip]:11434/api/tags +curl http://[tailscale-ip]:8085/health + +# Check Docker networks +docker network ls +docker ps --format "table {{.Names}}\t{{.Ports}}" +``` \ No newline at end of file diff --git 
a/Docs/features.md b/docs/features.md similarity index 100% rename from Docs/features.md rename to docs/features.md diff --git a/Docs/init-system.md b/docs/init-system.md similarity index 100% rename from Docs/init-system.md rename to docs/init-system.md diff --git a/docs/memory-providers.md b/docs/memory-providers.md new file mode 100644 index 00000000..908236a2 --- /dev/null +++ b/docs/memory-providers.md @@ -0,0 +1,182 @@ +# Memory System Architecture + +## Overview +Friend-Lite supports two pluggable memory backends that can be selected via configuration: + +## 1. Friend-Lite Memory Provider (`friend_lite`) +The sophisticated in-house memory implementation with full control and customization: + +### Features +- Custom LLM-powered memory extraction with enhanced prompts +- Individual fact storage (no JSON blobs) +- Smart deduplication algorithms +- Intelligent memory updates (ADD/UPDATE/DELETE decisions) +- **Semantic search** with relevance threshold filtering +- **Memory count API** with total count tracking from native Qdrant +- Direct Qdrant vector storage with accurate similarity scoring +- Custom memory prompts and processing +- No external dependencies + +### Architecture Flow +1. **Audio Input** → Transcription via Deepgram/Parakeet +2. **Memory Extraction** → LLM processes transcript using custom prompts +3. **Fact Parsing** → XML/JSON parsing into individual memory entries +4. **Deduplication** → Smart algorithms prevent duplicate memories +5. **Vector Storage** → Direct Qdrant storage with embeddings +6. **Memory Updates** → LLM-driven action proposals (ADD/UPDATE/DELETE) + +## 2. 
OpenMemory MCP Provider (`openmemory_mcp`) +Thin client that delegates all memory processing to external OpenMemory MCP server: + +### Features +- Professional memory extraction (handled by OpenMemory) +- Battle-tested deduplication (handled by OpenMemory) +- Semantic vector search (handled by OpenMemory) +- ACL-based user isolation (handled by OpenMemory) +- Cross-client compatibility (Claude Desktop, Cursor, Windsurf) +- Web UI for memory management at http://localhost:8765 + +### Architecture Flow +1. **Audio Input** → Transcription via Deepgram/Parakeet +2. **MCP Delegation** → Send enriched transcript to OpenMemory MCP server +3. **External Processing** → OpenMemory handles extraction, deduplication, storage +4. **Result Mapping** → Convert MCP results to Friend-Lite MemoryEntry format +5. **Client Management** → Automatic user context switching via MCP client + +## Memory Provider Comparison + +| Feature | Friend-Lite | OpenMemory MCP | +|---------|-------------|----------------| +| **Processing** | Custom LLM extraction | Delegates to OpenMemory | +| **Deduplication** | Custom algorithms | OpenMemory handles | +| **Vector Storage** | Direct Qdrant | OpenMemory handles | +| **Search Features** | Semantic search with threshold filtering | Semantic search with relevance scoring | +| **Memory Count** | Native Qdrant count API | Varies by OpenMemory support | +| **Dependencies** | Qdrant + MongoDB | External OpenMemory server | +| **Customization** | Full control | Limited to OpenMemory features | +| **Cross-client** | Friend-Lite only | Works with Claude Desktop, Cursor, etc | +| **Web UI** | Friend-Lite WebUI with advanced search | OpenMemory UI + Friend-Lite WebUI | +| **Memory Format** | Individual facts | OpenMemory format | +| **Setup Complexity** | Medium | High (external server required) | + +## Switching Memory Providers + +You can switch providers by changing the `MEMORY_PROVIDER` environment variable: + +```bash +# Switch to OpenMemory MCP +echo 
"MEMORY_PROVIDER=openmemory_mcp" >> .env + +# Switch back to Friend-Lite +echo "MEMORY_PROVIDER=friend_lite" >> .env +``` + +**Note:** Existing memories are not automatically migrated between providers. Each provider maintains its own memory storage. + +## OpenMemory MCP Setup + +To use the OpenMemory MCP provider: + +```bash +# 1. Start external OpenMemory MCP server +cd extras/openmemory-mcp +docker compose up -d + +# 2. Configure Friend-Lite to use OpenMemory MCP +cd backends/advanced +echo "MEMORY_PROVIDER=openmemory_mcp" >> .env + +# 3. Start Friend-Lite backend +docker compose up --build -d +``` + +## OpenMemory MCP Interface Patterns + +**Important**: OpenMemory MCP stores memories **per-app**, not globally. Understanding this architecture is critical for proper integration. + +### App-Based Storage Architecture +- All memories are stored under specific "apps" (namespaces) +- Generic endpoints (`/api/v1/memories/`) return empty results +- App-specific endpoints (`/api/v1/apps/{app_id}/memories`) contain the actual memories + +### Hardcoded Values and Configuration +```bash +# Default app name (configurable via OPENMEMORY_CLIENT_NAME) +Default: "friend_lite" + +# Hardcoded metadata (NOT configurable) +"source": "friend_lite" # Always hardcoded in Friend-Lite + +# User ID for OpenMemory MCP server +OPENMEMORY_USER_ID=openmemory # Configurable +``` + +### API Interface Pattern +```python +# 1. App Discovery - Find app by client_name +GET /api/v1/apps/ +# Response: {"apps": [{"id": "uuid", "name": "friend_lite", ...}]} + +# 2. Memory Creation - Uses generic endpoint but assigns to app +POST /api/v1/memories/ +{ + "user_id": "openmemory", + "text": "memory content", + "app": "friend_lite", # Uses OPENMEMORY_CLIENT_NAME + "metadata": { + "source": "friend_lite", # Hardcoded + "client": "friend_lite" # Uses OPENMEMORY_CLIENT_NAME + } +} + +# 3. 
Memory Retrieval - Must use app-specific endpoint +GET /api/v1/apps/{app_id}/memories?user_id=openmemory&page=1&size=10 + +# 4. Memory Search - Must use app-specific endpoint with search_query +GET /api/v1/apps/{app_id}/memories?user_id=openmemory&search_query=keyword&page=1&size=10 +``` + +### Friend-Lite Integration Flow +1. **App Discovery**: Query `/api/v1/apps/` to find app matching `OPENMEMORY_CLIENT_NAME` +2. **Fallback**: If client app not found, use first available app +3. **Operations**: All memory operations use the app-specific endpoints with discovered `app_id` + +### Testing OpenMemory MCP Integration +```bash +# Configure .env file with OpenMemory MCP settings +cp .env.template .env +# Edit .env to set MEMORY_PROVIDER=openmemory_mcp and configure OPENMEMORY_* variables + +# Start OpenMemory MCP server +cd extras/openmemory-mcp && docker compose up -d + +# Run integration tests (reads configuration from .env file) +cd backends/advanced && ./run-test.sh + +# Manual testing - Check app structure +curl -s "http://localhost:8765/api/v1/apps/" | jq + +# Test memory creation +curl -X POST "http://localhost:8765/api/v1/memories/" \ + -H "Content-Type: application/json" \ + -d '{"user_id": "openmemory", "text": "test memory", "app": "friend_lite"}' + +# Retrieve memories (replace app_id with actual ID from apps endpoint) +curl -s "http://localhost:8765/api/v1/apps/{app_id}/memories?user_id=openmemory" | jq +``` + +## When to Use Each Provider + +### Use Friend-Lite when: +- You want full control over memory processing +- You need custom memory extraction logic +- You prefer fewer external dependencies +- You want to customize memory prompts and algorithms +- You need individual fact-based memory storage + +### Use OpenMemory MCP when: +- You want professional, battle-tested memory processing +- You need cross-client compatibility (Claude Desktop, Cursor, etc.) 
+- You prefer to leverage external expertise rather than maintain custom logic +- You want access to OpenMemory's web interface +- You're already using OpenMemory in other tools \ No newline at end of file diff --git a/Docs/ports-and-access.md b/docs/ports-and-access.md similarity index 100% rename from Docs/ports-and-access.md rename to docs/ports-and-access.md diff --git a/docs/speaker-recognition.md b/docs/speaker-recognition.md new file mode 100644 index 00000000..63217f3f --- /dev/null +++ b/docs/speaker-recognition.md @@ -0,0 +1,73 @@ +# Speaker Recognition Service Features + +## Speaker Analysis & Visualization +The speaker recognition service now includes advanced analysis capabilities: + +### Embedding Analysis (/speakers/analysis endpoint) +- **2D/3D Visualization**: Interactive embedding plots using UMAP, t-SNE, or PCA +- **Clustering Analysis**: Automatic clustering using DBSCAN or K-means +- **Speaker Similarity Detection**: Identifies speakers with similar embeddings +- **Quality Metrics**: Embedding separation quality and confidence scores +- **Interactive Controls**: Adjustable analysis parameters and visualization options + +Access via: `extras/speaker-recognition/webui` → Speakers → Embedding Analysis tab + +### Live Inference Feature (/infer-live page) +Real-time speaker identification and transcription: +- **WebRTC Audio Capture**: Live microphone access with waveform visualization +- **Deepgram Streaming**: Real-time transcription with speaker diarization +- **Live Speaker ID**: Identifies enrolled speakers in real-time using internal service +- **Session Statistics**: Live metrics for words, speakers, and confidence scores +- **Configurable Settings**: Adjustable confidence thresholds and audio parameters + +Access via: `extras/speaker-recognition/webui` → Live Inference + +## Technical Implementation + +### Backend (Python) +- **Analysis Utils**: `src/simple_speaker_recognition/utils/analysis.py` + - UMAP/t-SNE dimensionality reduction + - 
DBSCAN/K-means clustering + - Cosine similarity analysis + - Quality metrics calculation +- **API Endpoint**: `/speakers/analysis` - Returns processed embedding analysis +- **Dependencies**: Added `umap-learn` for dimensionality reduction + +### Frontend (React/TypeScript) +- **EmbeddingPlot Component**: Interactive Plotly.js visualizations +- **LiveAudioCapture Component**: WebRTC audio recording with waveform +- **DeepgramStreaming Service**: WebSocket integration for real-time transcription +- **InferLive Page**: Complete live inference interface + +## Usage Instructions + +### Setting up Live Inference +1. Navigate to Live Inference page +2. Configure Deepgram API key in settings +3. Adjust speaker identification settings (confidence threshold) +4. Start live session to begin real-time transcription and speaker ID + +**Technical Details:** +- **Audio Processing**: Uses browser's native sample rate (typically 44.1kHz or 48kHz) +- **Buffer Retention**: 120 seconds of audio for improved utterance capture +- **Real-time Updates**: Live transcription with speaker identification results + +### Using Speaker Analysis +1. Go to Speakers page → Embedding Analysis tab +2. Select analysis method (UMAP, t-SNE, PCA) +3. Choose clustering algorithm (DBSCAN, K-means) +4. Adjust similarity threshold for speaker detection +5. 
View interactive plots and quality metrics + +## Deployment Notes +- Requires Docker rebuild to pick up new Python dependencies +- Frontend dependencies (Plotly.js) already included +- Live inference requires Deepgram API key for streaming transcription +- Speaker identification uses existing enrolled speakers from database + +## Live Inference Troubleshooting +- **"NaN:NaN" timestamps**: Fixed in recent updates; ensure you're using the latest version +- **Poor speaker identification**: Try adjusting confidence threshold or re-enrolling speakers +- **Audio processing delays**: Check browser console for sample rate detection logs +- **Buffer overflow issues**: Extended to 120-second retention for better performance +- **"extraction_failed" errors**: Usually indicate audio buffer timing issues - check console logs for buffer availability \ No newline at end of file diff --git a/docs/versioned-processing.md b/docs/versioned-processing.md new file mode 100644 index 00000000..849ac377 --- /dev/null +++ b/docs/versioned-processing.md @@ -0,0 +1,166 @@ +# Versioned Processing System + +## Overview + +Friend-Lite implements a comprehensive versioning system for both transcript and memory processing, allowing multiple processing attempts with different providers, models, or settings while maintaining a clean user experience. 
+ +## Version Data Structure + +### Transcript Versions +```json +{ + "transcript_versions": [ + { + "version_id": "uuid", + "transcript": "processed text", + "segments": [...], + "provider": "deepgram|mistral|parakeet", + "model": "nova-3|voxtral-mini-2507", + "created_at": "2025-01-15T10:30:00Z", + "processing_time_seconds": 12.5, + "metadata": { + "confidence_scores": [...], + "speaker_diarization": true + } + } + ], + "active_transcript_version": "uuid" +} +``` + +### Memory Versions +```json +{ + "memory_versions": [ + { + "version_id": "uuid", + "memory_count": 5, + "transcript_version_id": "uuid", + "provider": "friend_lite|openmemory_mcp", + "model": "gpt-4o-mini|ollama-llama3", + "created_at": "2025-01-15T10:32:00Z", + "processing_time_seconds": 45.2, + "metadata": { + "prompt_version": "v2.1", + "extraction_quality": "high" + } + } + ], + "active_memory_version": "uuid" +} +``` + +## Database Schema Details + +### Collections Overview +- **`audio_chunks`**: All audio sessions by `audio_uuid` (always created) +- **`conversations`**: Speech-detected conversations by `conversation_id` (created conditionally) +- **`users`**: User accounts and authentication data + +### Speech-Driven Schema +```javascript +// audio_chunks collection (always created) +{ + "_id": ObjectId, + "audio_uuid": "uuid", // Primary identifier + "user_id": ObjectId, + "client_id": "user_suffix-device_name", + "audio_file_path": "/path/to/audio.wav", + "created_at": ISODate, + "transcript": "fallback transcript", // For non-speech audio + "segments": [...], // Speaker segments + "has_speech": boolean, // Speech detection result + "speech_analysis": {...}, // Detection metadata + "conversation_id": "conv_id" | null // Link to conversations collection +} + +// conversations collection (speech-detected only) +{ + "_id": ObjectId, + "conversation_id": "conv_uuid", // Primary identifier for user-facing operations + "audio_uuid": "audio_uuid", // Link to audio_chunks + "user_id": ObjectId, + 
"client_id": "user_suffix-device_name", + "created_at": ISODate, + + // Versioned Transcript System + "transcript_versions": [ + { + "version_id": "uuid", + "transcript": "text content", + "segments": [...], // Speaker diarization + "provider": "deepgram|mistral|parakeet", + "model": "nova-3|voxtral-mini-2507", + "created_at": ISODate, + "processing_time_seconds": 12.5, + "metadata": {...} + } + ], + "active_transcript_version": "uuid", // Points to current version + + // Versioned Memory System + "memory_versions": [ + { + "version_id": "uuid", + "memory_count": 5, + "transcript_version_id": "uuid", // Which transcript was used + "provider": "friend_lite|openmemory_mcp", + "model": "gpt-4o-mini|ollama-llama3", + "created_at": ISODate, + "processing_time_seconds": 45.2, + "metadata": {...} + } + ], + "active_memory_version": "uuid", // Points to current version + + // Legacy Fields (auto-populated from active versions) + "transcript": "text", // From active_transcript_version + "segments": [...], // From active_transcript_version + "memories": [...], // From active_memory_version + "memory_count": 5 // From active_memory_version +} +``` + +## Reprocessing Workflows + +### Transcript Reprocessing +1. Trigger via API: `POST /api/conversations/{conversation_id}/reprocess-transcript` +2. System creates new transcript version with different provider/model +3. New version added to `transcript_versions` array +4. User can activate any version via `activate-transcript` endpoint +5. Legacy `transcript` field automatically updated from active version + +### Memory Reprocessing +1. Trigger via API: `POST /api/conversations/{conversation_id}/reprocess-memory` +2. Specify which transcript version to use as input +3. System creates new memory version using specified transcript +4. New version added to `memory_versions` array +5. User can activate any version via `activate-memory` endpoint +6. 
Legacy `memories` field automatically updated from active version + +## Legacy Field Compatibility + +### Automatic Population +- `transcript`: Auto-populated from active transcript version +- `segments`: Auto-populated from active transcript version +- `memories`: Auto-populated from active memory version +- `memory_count`: Auto-populated from active memory version + +### Backward Compatibility +- Existing API clients continue working without modification +- WebUI displays active versions by default +- Advanced users can access version history and switch between versions + +## Data Consistency +- All reprocessing operations use `conversation_id` (not `audio_uuid`) +- DateTime objects stored as ISO strings for MongoDB/JSON compatibility +- Legacy field support ensures existing integrations continue working + +## Key Architecture Benefits +- **Clean Separation**: Raw audio storage vs user-facing conversations +- **Speech Filtering**: Only meaningful conversations appear in UI +- **Version History**: Complete audit trail of processing attempts +- **Backward Compatibility**: Legacy fields ensure existing code works +- **Reprocessing Support**: Easy to re-run with different providers/models +- **Service Decoupling**: Conversation creation independent of memory processing +- **Error Isolation**: Memory service failures don't affect conversation storage \ No newline at end of file diff --git a/docs/wyoming-protocol.md b/docs/wyoming-protocol.md new file mode 100644 index 00000000..b9f4e59c --- /dev/null +++ b/docs/wyoming-protocol.md @@ -0,0 +1,79 @@ +# Wyoming Protocol Implementation + +## Overview +The system uses Wyoming protocol for WebSocket communication between mobile apps and backends. Wyoming is a peer-to-peer protocol for voice assistants that combines JSONL headers with binary audio payloads. 
+ +## Protocol Format +``` +{JSON_HEADER}\n +<binary_payload> +``` + +## Supported Events + +### Audio Session Events +- **audio-start**: Signals the beginning of an audio recording session + ```json + {"type": "audio-start", "data": {"rate": 16000, "width": 2, "channels": 1}, "payload_length": null} + ``` + +- **audio-chunk**: Contains raw audio data with format metadata + ```json + {"type": "audio-chunk", "data": {"rate": 16000, "width": 2, "channels": 1}, "payload_length": 320} + <320 bytes of PCM/Opus audio data> + ``` + +- **audio-stop**: Signals the end of an audio recording session + ```json + {"type": "audio-stop", "data": {"timestamp": 1234567890}, "payload_length": null} + ``` + +## Backend Implementation + +### Advanced Backend (`/ws_pcm`) +- **Full Wyoming Protocol Support**: Parses all Wyoming events for session management +- **Session Tracking**: Only processes audio chunks when session is active (after audio-start) +- **Conversation Boundaries**: Uses audio-start/stop events to define conversation segments +- **Backward Compatibility**: Fallback to raw binary audio for older clients + +### Simple Backend (`/ws`) +- **Minimal Wyoming Support**: Parses audio-chunk events, ignores others +- **Opus Processing**: Handles Opus-encoded audio chunks from Wyoming protocol +- **Graceful Degradation**: Falls back to raw Opus packets for compatibility + +## Mobile App Integration + +Mobile apps should implement Wyoming protocol for proper session management: + +```javascript +// Start audio session +const audioStart = { + type: "audio-start", + data: { rate: 16000, width: 2, channels: 1 }, + payload_length: null +}; +websocket.send(JSON.stringify(audioStart) + '\n'); + +// Send audio chunks +const audioChunk = { + type: "audio-chunk", + data: { rate: 16000, width: 2, channels: 1 }, + payload_length: audioData.byteLength +}; +websocket.send(JSON.stringify(audioChunk) + '\n'); +websocket.send(audioData); + +// End audio session +const audioStop = { + type: "audio-stop", + data: { 
timestamp: Date.now() }, + payload_length: null +}; +websocket.send(JSON.stringify(audioStop) + '\n'); +``` + +## Benefits +- **Clear Session Boundaries**: No timeout-based conversation detection needed +- **Structured Communication**: Consistent protocol across all audio streaming +- **Future Extensibility**: Room for additional event types (pause, resume, metadata) +- **Backward Compatibility**: Works with existing raw audio streaming clients \ No newline at end of file From 38a4ba1a7f59c236fd6073682f5f6d738402ebca Mon Sep 17 00:00:00 2001 From: Ankush Malaker <43288948+AnkushMalaker@users.noreply.github.com> Date: Mon, 22 Sep 2025 03:42:11 +0000 Subject: [PATCH 6/7] update --- .../conversation_manager.py | 277 +++++++++++++++- .../advanced/src/advanced_omi_backend/main.py | 4 - .../transcript_coordinator.py | 172 ---------- .../src/advanced_omi_backend/transcription.py | 297 ++---------------- 4 files changed, 304 insertions(+), 446 deletions(-) delete mode 100644 backends/advanced/src/advanced_omi_backend/transcript_coordinator.py diff --git a/backends/advanced/src/advanced_omi_backend/conversation_manager.py b/backends/advanced/src/advanced_omi_backend/conversation_manager.py index 92b1ee0b..a117aacb 100644 --- a/backends/advanced/src/advanced_omi_backend/conversation_manager.py +++ b/backends/advanced/src/advanced_omi_backend/conversation_manager.py @@ -6,12 +6,13 @@ """ import logging -from typing import Optional +import uuid +from datetime import UTC, datetime +from typing import Dict, List, Optional -from advanced_omi_backend.processors import ( - get_processor_manager, -) -from advanced_omi_backend.transcript_coordinator import get_transcript_coordinator +from advanced_omi_backend.database import ConversationsRepository, conversations_col +from advanced_omi_backend.llm_client import async_generate +from advanced_omi_backend.processors import get_processor_manager audio_logger = logging.getLogger("audio") @@ -24,9 +25,61 @@ class ConversationManager: """ 
def __init__(self): - self.coordinator = get_transcript_coordinator() audio_logger.info("ConversationManager initialized") + async def create_conversation(self, audio_uuid: str, transcript_data: dict, speech_analysis: dict, chunk_repo): + """Create conversation entry for detected speech.""" + try: + # Get audio session info from audio_chunks + audio_session = await chunk_repo.get_chunk(audio_uuid) + if not audio_session: + audio_logger.error(f"No audio session found for {audio_uuid}") + return None + + # Create conversation data (title and summary will be generated after speaker recognition) + conversation_id = str(uuid.uuid4()) + conversation_data = { + "conversation_id": conversation_id, + "audio_uuid": audio_uuid, + "user_id": audio_session["user_id"], + "client_id": audio_session["client_id"], + "title": "Processing...", # Placeholder - will be updated after speaker recognition + "summary": "Processing...", # Placeholder - will be updated after speaker recognition + + # Versioned system (source of truth) + "transcript_versions": [], + "active_transcript_version": None, + "memory_versions": [], + "active_memory_version": None, + + # Legacy compatibility fields (auto-populated on read) + # Note: These will be auto-populated from active versions when retrieved + + "duration_seconds": speech_analysis.get("duration", 0.0), + "speech_start_time": speech_analysis.get("speech_start", 0.0), + "speech_end_time": speech_analysis.get("speech_end", 0.0), + "speaker_names": {}, + "action_items": [], + "created_at": datetime.now(UTC), + "updated_at": datetime.now(UTC), + "session_start": datetime.fromtimestamp(audio_session.get("timestamp", 0) / 1000, tz=UTC), + "session_end": datetime.now(UTC), + } + + # Create conversation in conversations collection + conversations_repo = ConversationsRepository(conversations_col) + await conversations_repo.create_conversation(conversation_data) + + # Mark audio_chunks as having speech and link to conversation + await 
chunk_repo.mark_conversation_created(audio_uuid, conversation_id) + + audio_logger.info(f"✅ Created conversation {conversation_id} for audio {audio_uuid} (speech detected)") + return conversation_id + + except Exception as e: + audio_logger.error(f"Failed to create conversation for {audio_uuid}: {e}", exc_info=True) + return None + async def close_conversation( self, client_id: str, @@ -84,6 +137,218 @@ async def close_conversation( audio_logger.error(f"❌ Error closing conversation {audio_uuid}: {e}", exc_info=True) return False + async def generate_title( + self, + *, + speaker_segments: Optional[List[Dict]] = None, + text: Optional[str] = None + ) -> str: + """Generate conversation title with speaker-aware formatting when available. + + Args: + speaker_segments: List of segments with speaker info (preferred) + text: Raw conversation text (fallback) + + Returns: + Generated title (max 40 characters) + """ + # Validation + if not speaker_segments and not text: + return "Conversation" + + # Format conversation text (unified approach) + if speaker_segments: + conversation_text = self._format_segments_with_speakers(speaker_segments[:10]) + context = "this conversation with speakers" + include_speakers_instruction = "- Include speaker names when relevant" + else: + conversation_text = text[:500] if text else "" + context = "this conversation transcript" + include_speakers_instruction = "- Focus on main topic" + + if not conversation_text.strip(): + return "Conversation" + + try: + # Unified prompt (consistent constraints) + prompt = f"Generate a concise, descriptive title (max 40 characters) for {context}:"\ + + f"{conversation_text}"\ + + "Rules:\n"\ + + "- Maximum 40 characters\n"\ + + f"{include_speakers_instruction}\n"\ + + "- Capture the main topic\n"\ + + "- Be specific and informative\n"\ + + "Title:" + + title = await async_generate(prompt, temperature=0.3) + return self._clean_and_truncate_title(title) + + except Exception as e: + audio_logger.warning(f"Failed 
to generate LLM title: {e}") + # Fallback to simple title generation + words = conversation_text.split()[:6] + title = " ".join(words) + return title[:40] + "..." if len(title) > 40 else title or "Conversation" + + async def generate_summary( + self, + *, + speaker_segments: Optional[List[Dict]] = None, + text: Optional[str] = None + ) -> str: + """Generate conversation summary with speaker-aware formatting when available. + + Args: + speaker_segments: List of segments with speaker info (preferred) + text: Raw conversation text (fallback) + + Returns: + Generated summary (max 120 characters) + """ + # Validation + if not speaker_segments and not text: + return "No content" + + # Format conversation text (unified approach) + if speaker_segments: + conversation_text = self._format_segments_with_speakers(speaker_segments) + context = "this conversation with speakers" + include_speakers_instruction = "- Include speaker names when relevant (e.g., \"John discusses X with Sarah\")" + else: + conversation_text = text[:1000] if text else "" + context = "this conversation transcript" + include_speakers_instruction = "- Focus on key topics and outcomes" + + if not conversation_text.strip(): + return "No content" + + try: + # Unified prompt (consistent constraints) + prompt = f"Generate a brief, informative summary (1-2 sentences, max 120 characters) for {context}:"\ + + f"\n\n\"{conversation_text}\"\n\n"\ + + "Rules:\n"\ + + "- Maximum 120 characters\n"\ + + "- 1-2 complete sentences\n"\ + + f"{include_speakers_instruction}\n"\ + + "- Capture key topics and outcomes\n"\ + + "- Use present tense\n"\ + + "- Be specific and informative\n\n"\ + + "Summary:" + + summary = await async_generate(prompt, temperature=0.3) + return self._clean_and_truncate_summary(summary) + + except Exception as e: + audio_logger.warning(f"Failed to generate LLM summary: {e}") + # Fallback to simple summary generation + return conversation_text[:120] + "..." 
if len(conversation_text) > 120 else conversation_text or "No content" + + def _format_segments_with_speakers(self, segments: List[Dict]) -> str: + """Helper to format segments with speaker names.""" + conversation_text = "" + for segment in segments: + speaker = segment.get("speaker", "") + text = segment.get("text", "").strip() + if text: + if speaker: + conversation_text += f"{speaker}: {text}\n" + else: + conversation_text += f"{text}\n" + return conversation_text + + def _clean_and_truncate_title(self, title: str) -> str: + """Helper to clean and truncate title.""" + title = title.strip().strip('"').strip("'") + return title[:40] + "..." if len(title) > 40 else title or "Conversation" + + def _clean_and_truncate_summary(self, summary: str) -> str: + """Helper to clean and truncate summary.""" + summary = summary.strip().strip('"').strip("'") + return summary[:120] + "..." if len(summary) > 120 else summary or "No content" + + async def create_conversation_with_processing( + self, + audio_uuid: str, + transcript_data: dict, + speech_analysis: dict, + speaker_segments: List[Dict], + chunk_repo + ) -> Optional[str]: + """High-level method to create conversation with complete processing. + + This method handles: + 1. Basic conversation creation + 2. Title and summary generation + 3. Transcript version creation and activation + 4. 
Conversation updates with speaker info + + Args: + audio_uuid: Audio UUID for the conversation + transcript_data: Transcript data from transcription provider + speech_analysis: Speech detection analysis results + speaker_segments: Processed segments with speaker information + chunk_repo: AudioChunksRepository instance + + Returns: + conversation_id if successful, None if failed + """ + try: + # Step 1: Create basic conversation + conversation_id = await self.create_conversation( + audio_uuid, transcript_data, speech_analysis, chunk_repo + ) + if not conversation_id: + audio_logger.error(f"Failed to create basic conversation for {audio_uuid}") + return None + + # Step 2: Create and activate initial transcript version + conversations_repo = ConversationsRepository(conversations_col) + conversation = await conversations_repo.get_conversation(conversation_id) + + if conversation and not conversation.get("active_transcript_version"): + # Create initial transcript version + version_id = await conversations_repo.create_transcript_version( + conversation_id=conversation_id, + segments=speaker_segments, + provider="speech_detection", + raw_data={} + ) + if version_id: + # Activate this version + await conversations_repo.activate_transcript_version(conversation_id, version_id) + audio_logger.info(f"✅ Created and activated initial transcript version {version_id} for conversation {conversation_id}") + + # Step 3: Generate title and summary with speaker awareness + title = await self.generate_title(speaker_segments=speaker_segments) + summary = await self.generate_summary(speaker_segments=speaker_segments) + + # Step 4: Extract speaker information + speaker_names = {} + speakers_found = set() + for segment in speaker_segments: + speaker_name = segment.get("identified_as") or segment.get("speaker") + if speaker_name: + speakers_found.add(speaker_name) + # Map speaker_id to name if available + speaker_id = segment.get("speaker_id", "") + if speaker_id: + speaker_names[speaker_id] 
= speaker_name + + # Step 5: Update conversation with final content + update_data = { + "title": title, + "summary": summary, + "speaker_names": speaker_names, + "updated_at": datetime.now(UTC) + } + await conversations_repo.update_conversation(conversation_id, update_data) + + audio_logger.info(f"✅ Completed conversation processing for {conversation_id} with {len(speaker_segments)} segments, {len(speakers_found)} speakers") + return conversation_id + + except Exception as e: + audio_logger.error(f"Failed to create conversation with processing for {audio_uuid}: {e}", exc_info=True) + return None # Global singleton instance diff --git a/backends/advanced/src/advanced_omi_backend/main.py b/backends/advanced/src/advanced_omi_backend/main.py index f463f29d..f492ebd7 100644 --- a/backends/advanced/src/advanced_omi_backend/main.py +++ b/backends/advanced/src/advanced_omi_backend/main.py @@ -50,7 +50,6 @@ ) from advanced_omi_backend.audio_utils import process_audio_chunk from advanced_omi_backend.task_manager import init_task_manager, get_task_manager -from advanced_omi_backend.transcript_coordinator import get_transcript_coordinator from advanced_omi_backend.transcription_providers import get_transcription_provider from advanced_omi_backend.users import ( User, @@ -281,9 +280,6 @@ async def cleanup_client_state(client_id: str): except Exception as processor_cleanup_error: logger.error(f"Error cleaning up processor tasks for {client_id}: {processor_cleanup_error}") - # Clean up any orphaned transcript events for this client - coordinator = get_transcript_coordinator() - coordinator.cleanup_transcript_events_for_client(client_id) logger.info(f"Client {client_id} cleaned up successfully") else: diff --git a/backends/advanced/src/advanced_omi_backend/transcript_coordinator.py b/backends/advanced/src/advanced_omi_backend/transcript_coordinator.py deleted file mode 100644 index 696a7087..00000000 --- a/backends/advanced/src/advanced_omi_backend/transcript_coordinator.py +++ 
/dev/null @@ -1,172 +0,0 @@ -"""Transcript Coordinator for Event-Driven Memory Processing. - -This module provides proper async coordination between transcript completion and memory processing, -eliminating polling/retry mechanisms in favor of asyncio events. -""" - -import asyncio -import logging -from typing import Dict, Optional - -logger = logging.getLogger(__name__) - - -class TranscriptionFailed(Exception): - """Exception raised when transcription fails.""" - pass - - -class TranscriptCoordinator: - """Coordinates transcript completion events across the system. - - This replaces polling/retry mechanisms with proper asyncio event coordination. - When transcription is saved to the database, it signals waiting memory processors. - """ - - def __init__(self): - self.transcript_events: Dict[str, asyncio.Event] = {} - self.transcript_failures: Dict[str, str] = {} # audio_uuid -> error_message - self._lock = asyncio.Lock() - logger.info("TranscriptCoordinator initialized") - - async def wait_for_transcript_completion(self, audio_uuid: str, timeout: float = 30.0) -> bool: - """Wait for transcript completion for the given audio_uuid. 
- - Args: - audio_uuid: The audio UUID to wait for - timeout: Maximum time to wait in seconds - - Returns: - True if transcript was completed successfully, False if timeout or failed - - Raises: - TranscriptionFailed: If transcription failed with an error - """ - async with self._lock: - # Check if there's already a failure recorded before creating/waiting on event - if audio_uuid in self.transcript_failures: - error_msg = self.transcript_failures.pop(audio_uuid) - logger.error(f"Transcript already failed for {audio_uuid}: {error_msg}") - raise TranscriptionFailed(f"Transcription failed: {error_msg}") - - # Create event for this audio_uuid if it doesn't exist - if audio_uuid not in self.transcript_events: - self.transcript_events[audio_uuid] = asyncio.Event() - logger.info(f"Created transcript wait event for {audio_uuid}") - - event = self.transcript_events[audio_uuid] - - try: - # Wait for the transcript to be ready - await asyncio.wait_for(event.wait(), timeout=timeout) - - # Check if this was a failure (covers failures signaled during the wait) - if audio_uuid in self.transcript_failures: - error_msg = self.transcript_failures[audio_uuid] - logger.error(f"Transcript failed for {audio_uuid}: {error_msg}") - # Clean up failure tracking - self.transcript_failures.pop(audio_uuid, None) - raise TranscriptionFailed(f"Transcription failed: {error_msg}") - - logger.info(f"Transcript ready event received for {audio_uuid}") - return True - except asyncio.TimeoutError: - logger.warning(f"Transcript wait timeout ({timeout}s) for {audio_uuid}") - return False - finally: - # Clean up the event - async with self._lock: - self.transcript_events.pop(audio_uuid, None) - self.transcript_failures.pop(audio_uuid, None) - logger.debug(f"Cleaned up transcript event for {audio_uuid}") - - def signal_transcript_ready(self, audio_uuid: str): - """Signal that transcript is ready for the given audio_uuid. 
- - This should be called by TranscriptionManager after successfully saving - transcript segments to the database. - - Args: - audio_uuid: The audio UUID that has completed transcription - """ - if audio_uuid in self.transcript_events: - self.transcript_events[audio_uuid].set() - logger.info(f"Signaled transcript ready for {audio_uuid}") - else: - logger.debug(f"No waiting processors for transcript {audio_uuid}") - - def signal_transcript_failed(self, audio_uuid: str, error_message: str): - """Signal that transcript processing failed for the given audio_uuid. - - This should be called by TranscriptionManager when transcription fails. - Waiting processes will be unblocked and can check for failure status. - - Args: - audio_uuid: The audio UUID that failed transcription - error_message: Description of the failure - """ - # Store the failure message - self.transcript_failures[audio_uuid] = error_message - - # Always create an Event for the audio_uuid if missing so future waiters see the failure immediately - if audio_uuid not in self.transcript_events: - self.transcript_events[audio_uuid] = asyncio.Event() - logger.debug(f"Created transcript event for failed {audio_uuid}") - - # Set the event to unblock waiting processes (current and future) - self.transcript_events[audio_uuid].set() - logger.error(f"Signaled transcript failed for {audio_uuid}: {error_message}") - - def cleanup_transcript_events_for_client(self, client_id: str): - """Clean up any transcript events associated with a disconnected client. - - This prevents memory leaks and orphaned events when clients disconnect - before transcription completes. 
- - Args: - client_id: The client ID that disconnected - """ - # Since we don't track client_id -> audio_uuid mapping here, - # this is a safety method that can be called but currently has limited scope - # In the future, we could enhance this by tracking client associations - events_cleaned = 0 - for audio_uuid in list(self.transcript_events.keys()): - # For now, we'll rely on the timeout mechanism in wait_for_transcript_completion - # Future enhancement: track client_id associations to enable targeted cleanup - pass - - if events_cleaned > 0: - logger.info(f"Cleaned up {events_cleaned} transcript events for disconnected client {client_id}") - else: - logger.debug(f"No transcript events to clean up for client {client_id}") - - async def cleanup_stale_events(self, max_age_seconds: float = 300.0): - """Clean up any stale events that might be left over. - - This is a safety mechanism to prevent memory leaks if events are not - properly cleaned up during normal operation. - - Args: - max_age_seconds: Maximum age for events before cleanup - """ - async with self._lock: - # For now, just log the count - in a real implementation you'd track creation times - stale_count = len(self.transcript_events) - if stale_count > 0: - logger.warning(f"Found {stale_count} potentially stale transcript events") - - def get_waiting_count(self) -> int: - """Get the number of currently waiting transcript events.""" - return len(self.transcript_events) - - -# Global singleton instance -_transcript_coordinator: Optional[TranscriptCoordinator] = None - - -def get_transcript_coordinator() -> TranscriptCoordinator: - """Get the global TranscriptCoordinator instance.""" - global _transcript_coordinator - if _transcript_coordinator is None: - _transcript_coordinator = TranscriptCoordinator() - return _transcript_coordinator diff --git a/backends/advanced/src/advanced_omi_backend/transcription.py b/backends/advanced/src/advanced_omi_backend/transcription.py index f6ac919f..7068e305 100644 --- 
a/backends/advanced/src/advanced_omi_backend/transcription.py +++ b/backends/advanced/src/advanced_omi_backend/transcription.py @@ -12,6 +12,7 @@ get_speech_detection_settings, load_diarization_settings_from_file, ) +from advanced_omi_backend.conversation_manager import get_conversation_manager from advanced_omi_backend.database import ConversationsRepository, conversations_col from advanced_omi_backend.llm_client import async_generate from advanced_omi_backend.processors import ( @@ -20,7 +21,6 @@ get_processor_manager, ) from advanced_omi_backend.speaker_recognition_client import SpeakerRecognitionClient -from advanced_omi_backend.transcript_coordinator import get_transcript_coordinator from advanced_omi_backend.transcription_providers import ( BaseTranscriptionProvider, get_transcription_provider, @@ -212,9 +212,8 @@ async def process_collected_audio(self): await self.chunk_repo.update_transcription_status( self._current_audio_uuid, "FAILED", error_message=str(e) ) - # Signal coordinator about failure - coordinator = get_transcript_coordinator() - coordinator.signal_transcript_failed(self._current_audio_uuid, str(e)) + # Transcription failed + logger.error(f"Transcript failed for {self._current_audio_uuid}: {str(e)}") async def _get_transcript(self): """Get transcript from any provider using unified interface.""" @@ -270,12 +269,10 @@ async def _process_transcript_result(self, transcript_result): """Process transcript result uniformly for all providers.""" if not transcript_result or not self._current_audio_uuid: logger.info(f"⚠️ No transcript result to process for {self._current_audio_uuid}") - # Even with no transcript, signal completion to unblock memory processing + # No transcript to process if self._current_audio_uuid: - coordinator = get_transcript_coordinator() - coordinator.signal_transcript_ready(self._current_audio_uuid) logger.info( - f"⚠️ Signaled transcript completion (no data) for {self._current_audio_uuid}" + f"⚠️ No transcript data for 
{self._current_audio_uuid}" ) return @@ -297,11 +294,9 @@ async def _process_transcript_result(self, transcript_result): logger.warning( f"No text in normalized transcript result for {self._current_audio_uuid}" ) - # Signal completion even with empty text to unblock memory processing - coordinator = get_transcript_coordinator() - coordinator.signal_transcript_ready(self._current_audio_uuid) + # Empty transcript text logger.warning( - f"⚠️ Signaled transcript completion (empty text) for {self._current_audio_uuid}" + f"⚠️ Empty transcript text for {self._current_audio_uuid}" ) return @@ -324,30 +319,21 @@ async def _process_transcript_result(self, transcript_result): **{k: v for k, v in speech_analysis.items() if k != "has_speech"} ) - # Create conversation only if speech is detected + # Speech detection check - conversation will be created after speaker recognition conversation_id = None - if speech_analysis["has_speech"]: - conversation_id = await self._create_conversation( - self._current_audio_uuid, transcript_data, speech_analysis - ) - if conversation_id: - logger.info(f"✅ Created conversation {conversation_id} for detected speech in {self._current_audio_uuid}") - else: - logger.error(f"❌ Failed to create conversation for {self._current_audio_uuid}") - else: + if not speech_analysis["has_speech"]: logger.info(f"⏭️ No speech detected in {self._current_audio_uuid}: {speech_analysis.get('reason', 'Unknown reason')}") # Update transcript status to EMPTY for silent audio if self.chunk_repo: await self.chunk_repo.update_transcription_status( self._current_audio_uuid, "EMPTY", provider=provider_name ) - # Signal completion but don't queue memory processing - coordinator = get_transcript_coordinator() - coordinator.signal_transcript_ready(self._current_audio_uuid) + # No speech detected, not queuing memory processing + logger.info(f"No speech detected for {self._current_audio_uuid}") return - # SPEECH GAP ANALYSIS: Check for conversation closure (only if conversation 
exists) - if conversation_id: + # SPEECH GAP ANALYSIS: Check for conversation closure (only if speech detected) + if speech_analysis["has_speech"]: analyzer = SpeechActivityAnalyzer(self._audio_timeline) activity = analyzer.analyze_transcript_activity(transcript_data) @@ -368,9 +354,8 @@ async def _process_transcript_result(self, transcript_result): f"closing conversation for {self._client_id}" ) await self._trigger_conversation_close() - # Signal completion and return (conversation closed) - coordinator = get_transcript_coordinator() - coordinator.signal_transcript_ready(self._current_audio_uuid) + # Conversation closed due to inactivity + logger.info(f"Conversation closed for {self._current_audio_uuid}") return else: # Update last word time for next analysis @@ -472,51 +457,30 @@ async def _process_transcript_result(self, transcript_result): for speaker in speakers_found: await self.chunk_repo.add_speaker(self._current_audio_uuid, speaker) - # CRITICAL: Update conversation with transcript data - if conversation_id: - try: - conversations_repo = ConversationsRepository(conversations_col) - - # Check if this is the first transcript for this conversation - conversation = await conversations_repo.get_conversation(conversation_id) - if conversation and not conversation.get("active_transcript_version"): - # This is the first transcript - create initial version - version_id = await conversations_repo.create_transcript_version( - conversation_id=conversation_id, - segments=segments_to_store, - provider="speech_detection", - raw_data={} - ) - if version_id: - # Activate this version - await conversations_repo.activate_transcript_version(conversation_id, version_id) - logger.info(f"✅ Created and activated initial transcript version {version_id} for conversation {conversation_id}") - - # Generate title and summary with speaker information - title = await self._generate_title_with_speakers(segments_to_store) - summary = await 
self._generate_summary_with_speakers(segments_to_store) - - # Update conversation with speaker info, title, summary and metadata - update_data = { - "title": title, - "summary": summary, - "speaker_names": speaker_names, - "updated_at": datetime.now(UTC) - } - await conversations_repo.update_conversation(conversation_id, update_data) - - logger.info(f"✅ Updated conversation {conversation_id} with {len(segments_to_store)} transcript segments, {len(speakers_found)} speakers, and speaker-aware title/summary") - except Exception as e: - logger.error(f"Failed to update conversation {conversation_id} with transcript data: {e}") + conversation_manager = get_conversation_manager() + conversation_id = await conversation_manager.create_conversation_with_processing( + audio_uuid=self._current_audio_uuid, + transcript_data=transcript_data, + speech_analysis=speech_analysis, + speaker_segments=segments_to_store, + chunk_repo=self.chunk_repo + ) + + if not conversation_id: + logger.error(f"❌ Failed to create conversation for {self._current_audio_uuid}") + # Continue processing even if conversation creation fails + else: + # Edge case: speech detected but no segments processed + logger.warning(f"🚨 EDGE CASE: Speech detected but no segments processed for {self._current_audio_uuid}. Developer felt this edge case can never happen. Developer wants to sleep. 
😴") + # If this actually happens, we should investigate why final_segments was empty # Update client state current_client = self._get_current_client() if current_client: current_client.update_transcript_received() - # Signal transcript coordinator - coordinator = get_transcript_coordinator() - coordinator.signal_transcript_ready(self._current_audio_uuid) + # Transcript processing completed + logger.info(f"Transcript completed for {self._current_audio_uuid}") # Queue memory processing now that transcription is complete (only for conversations with speech) if conversation_id: @@ -624,201 +588,6 @@ def _analyze_speech(self, transcript_data: dict): return {"has_speech": False, "reason": "No meaningful speech content detected"} - async def _create_conversation(self, audio_uuid: str, transcript_data: dict, speech_analysis: dict): - """Create conversation entry for detected speech.""" - try: - # Get audio session info from audio_chunks - audio_session = await self.chunk_repo.get_chunk(audio_uuid) - if not audio_session: - logger.error(f"No audio session found for {audio_uuid}") - return None - - # Create conversation data (title and summary will be generated after speaker recognition) - conversation_id = str(uuid.uuid4()) - conversation_data = { - "conversation_id": conversation_id, - "audio_uuid": audio_uuid, - "user_id": audio_session["user_id"], - "client_id": audio_session["client_id"], - "title": "Processing...", # Placeholder - will be updated after speaker recognition - "summary": "Processing...", # Placeholder - will be updated after speaker recognition - - # Versioned system (source of truth) - "transcript_versions": [], - "active_transcript_version": None, - "memory_versions": [], - "active_memory_version": None, - - # Legacy compatibility fields (auto-populated on read) - # Note: These will be auto-populated from active versions when retrieved - - "duration_seconds": speech_analysis.get("duration", 0.0), - "speech_start_time": 
speech_analysis.get("speech_start", 0.0), - "speech_end_time": speech_analysis.get("speech_end", 0.0), - "speaker_names": {}, - "action_items": [], - "created_at": datetime.now(UTC), - "updated_at": datetime.now(UTC), - "session_start": datetime.fromtimestamp(audio_session.get("timestamp", 0), tz=UTC), - "session_end": datetime.now(UTC), - } - - # Create conversation in conversations collection - conversations_repo = ConversationsRepository(conversations_col) - await conversations_repo.create_conversation(conversation_data) - - # Mark audio_chunks as having speech and link to conversation - await self.chunk_repo.mark_conversation_created(audio_uuid, conversation_id) - - logger.info(f"✅ Created conversation {conversation_id} for audio {audio_uuid} (speech detected)") - return conversation_id - - except Exception as e: - logger.error(f"Failed to create conversation for {audio_uuid}: {e}", exc_info=True) - return None - - async def _generate_title(self, text: str) -> str: - """Generate an LLM-powered title from conversation text.""" - if not text or len(text.strip()) < 10: - return "Conversation" - - try: - prompt = f"""Generate a concise, descriptive title (3-6 words) for this conversation transcript: - -"{text[:500]}" - -Rules: -- Maximum 6 words -- Capture the main topic or theme -- No quotes or special characters -- Examples: "Planning Weekend Trip", "Work Project Discussion", "Medical Appointment" - -Title:""" - - title = await async_generate(prompt, temperature=0.3) - return title.strip().strip('"').strip("'") or "Conversation" - - except Exception as e: - logger.warning(f"Failed to generate LLM title: {e}") - # Fallback to simple title generation - words = text.split()[:6] - title = " ".join(words) - return title[:40] + "..." 
if len(title) > 40 else title or "Conversation" - - async def _generate_summary(self, text: str) -> str: - """Generate an LLM-powered summary from conversation text.""" - if not text or len(text.strip()) < 10: - return "No content" - - try: - prompt = f"""Generate a brief, informative summary (1-2 sentences, max 120 characters) for this conversation: - -"{text[:1000]}" - -Rules: -- Maximum 120 characters -- 1-2 complete sentences -- Capture key topics and outcomes -- Use present tense -- Be specific and informative - -Summary:""" - - summary = await async_generate(prompt, temperature=0.3) - return summary.strip().strip('"').strip("'") or "No content" - - except Exception as e: - logger.warning(f"Failed to generate LLM summary: {e}") - # Fallback to simple summary generation - return text[:120] + "..." if len(text) > 120 else text or "No content" - - async def _generate_title_with_speakers(self, segments: list) -> str: - """Generate an LLM-powered title from conversation segments with speaker information.""" - if not segments: - return "Conversation" - - # Format conversation with speaker names - conversation_text = "" - for segment in segments[:10]: # Use first 10 segments for title generation - speaker = segment.get("speaker", "") - text = segment.get("text", "").strip() - if text: - if speaker: - conversation_text += f"{speaker}: {text}\n" - else: - conversation_text += f"{text}\n" - - if not conversation_text.strip(): - return "Conversation" - - try: - prompt = f"""Generate a concise title (max 40 characters) for this conversation: - -"{conversation_text[:500]}" - -Rules: -- Maximum 40 characters -- Include speaker names if relevant -- Capture the main topic -- Be specific and informative - -Title:""" - - title = await async_generate(prompt, temperature=0.3) - title = title.strip().strip('"').strip("'") - return title[:40] + "..." 
if len(title) > 40 else title or "Conversation" - - except Exception as e: - logger.warning(f"Failed to generate LLM title with speakers: {e}") - # Fallback to simple title generation - words = conversation_text.split()[:6] - title = " ".join(words) - return title[:40] + "..." if len(title) > 40 else title or "Conversation" - - async def _generate_summary_with_speakers(self, segments: list) -> str: - """Generate an LLM-powered summary from conversation segments with speaker information.""" - if not segments: - return "No content" - - # Format conversation with speaker names - conversation_text = "" - speakers_in_conv = set() - for segment in segments: - speaker = segment.get("speaker", "") - text = segment.get("text", "").strip() - if text: - if speaker: - conversation_text += f"{speaker}: {text}\n" - speakers_in_conv.add(speaker) - else: - conversation_text += f"{text}\n" - - if not conversation_text.strip(): - return "No content" - - try: - prompt = f"""Generate a brief, informative summary (1-2 sentences, max 120 characters) for this conversation with speakers: - -"{conversation_text[:1000]}" - -Rules: -- Maximum 120 characters -- 1-2 complete sentences -- Include speaker names when relevant (e.g., "John discusses X with Sarah") -- Capture key topics and outcomes -- Use present tense -- Be specific and informative - -Summary:""" - - summary = await async_generate(prompt, temperature=0.3) - return summary.strip().strip('"').strip("'") or "No content" - - except Exception as e: - logger.warning(f"Failed to generate LLM summary with speakers: {e}") - # Fallback to simple summary generation - return conversation_text[:120] + "..." if len(conversation_text) > 120 else conversation_text or "No content" - async def _queue_memory_processing(self, conversation_id: str): """Queue memory processing for a speech-detected conversation. 
From 515fb818318a219b6283bcf2caf0046c5752296f Mon Sep 17 00:00:00 2001 From: Ankush Malaker <43288948+AnkushMalaker@users.noreply.github.com> Date: Mon, 22 Sep 2025 05:17:18 +0000 Subject: [PATCH 7/7] cleanup and beautify --- .../src/advanced_omi_backend/audio_utils.py | 4 - .../src/advanced_omi_backend/client.py | 38 +----- .../advanced/src/advanced_omi_backend/main.py | 3 - backends/advanced/webui/src/pages/System.tsx | 129 +++++++++--------- 4 files changed, 65 insertions(+), 109 deletions(-) diff --git a/backends/advanced/src/advanced_omi_backend/audio_utils.py b/backends/advanced/src/advanced_omi_backend/audio_utils.py index 1a3937c7..88f03026 100644 --- a/backends/advanced/src/advanced_omi_backend/audio_utils.py +++ b/backends/advanced/src/advanced_omi_backend/audio_utils.py @@ -87,10 +87,6 @@ async def process_audio_chunk( await processor_manager.queue_audio(processing_item) - # Update client state if provided - if client_state is not None: - client_state.update_audio_received(chunk) - async def load_audio_file_as_chunk(audio_path: Path) -> AudioChunk: """Load existing audio file into Wyoming AudioChunk format for reprocessing. diff --git a/backends/advanced/src/advanced_omi_backend/client.py b/backends/advanced/src/advanced_omi_backend/client.py index 3c43a43a..4cb10999 100644 --- a/backends/advanced/src/advanced_omi_backend/client.py +++ b/backends/advanced/src/advanced_omi_backend/client.py @@ -5,9 +5,7 @@ application level by the ProcessorManager. 
""" -import asyncio import logging -import os import time from pathlib import Path from typing import Dict, List, Optional, Tuple @@ -15,14 +13,10 @@ from advanced_omi_backend.conversation_manager import get_conversation_manager from advanced_omi_backend.database import AudioChunksRepository from advanced_omi_backend.task_manager import get_task_manager -from wyoming.audio import AudioChunk # Get loggers audio_logger = logging.getLogger("audio_processing") -# Configuration constants -NEW_CONVERSATION_TIMEOUT_MINUTES = float(os.getenv("NEW_CONVERSATION_TIMEOUT_MINUTES", "1.5")) - class ClientState: """Manages conversation state for a single client connection.""" @@ -67,11 +61,6 @@ def __init__( audio_logger.info(f"Created client state for {client_id}") - def update_audio_received(self, chunk: AudioChunk): - """Update state when audio is received.""" - # Check if we should start a new conversation - if self.should_start_new_conversation(): - asyncio.create_task(self.start_new_conversation()) def set_current_audio_uuid(self, audio_uuid: str): """Set the current audio UUID when processor creates a new file.""" @@ -104,20 +93,9 @@ def record_speech_end(self, audio_uuid: str, timestamp: float): audio_logger.warning(f"Speech end recorded for {audio_uuid} but no start time found") def update_transcript_received(self): - """Update timestamp when transcript is received (for timeout detection).""" + """Update timestamp when transcript is received.""" self.last_transcript_time = time.time() - def should_start_new_conversation(self) -> bool: - """Check if we should start a new conversation based on timeout.""" - if self.last_transcript_time is None: - return False - - current_time = time.time() - time_since_last_transcript = current_time - self.last_transcript_time - timeout_seconds = NEW_CONVERSATION_TIMEOUT_MINUTES * 60 - - return time_since_last_transcript > timeout_seconds - async def close_current_conversation(self): """Close the current conversation and queue necessary 
processing.""" # Prevent double closure @@ -161,20 +139,6 @@ async def close_current_conversation(self): else: audio_logger.warning(f"⚠️ Conversation closure had issues for {self.current_audio_uuid}") - async def start_new_conversation(self): - """Start a new conversation by closing current and resetting state.""" - await self.close_current_conversation() - - # Reset conversation state - self.current_audio_uuid = None - self.conversation_start_time = time.time() - self.last_transcript_time = None - self.conversation_closed = False - - audio_logger.info( - f"Client {self.client_id}: Started new conversation due to " - f"{NEW_CONVERSATION_TIMEOUT_MINUTES}min timeout" - ) async def disconnect(self): """Clean disconnect of client state.""" diff --git a/backends/advanced/src/advanced_omi_backend/main.py b/backends/advanced/src/advanced_omi_backend/main.py index f492ebd7..5d40c18d 100644 --- a/backends/advanced/src/advanced_omi_backend/main.py +++ b/backends/advanced/src/advanced_omi_backend/main.py @@ -112,8 +112,6 @@ SEGMENT_SECONDS = 60 # length of each stored chunk TARGET_SAMPLES = OMI_SAMPLE_RATE * SEGMENT_SECONDS -# Conversation timeout configuration -NEW_CONVERSATION_TIMEOUT_MINUTES = float(os.getenv("NEW_CONVERSATION_TIMEOUT_MINUTES", "1.5")) # Audio cropping configuration AUDIO_CROPPING_ENABLED = os.getenv("AUDIO_CROPPING_ENABLED", "true").lower() == "true" @@ -977,7 +975,6 @@ async def health_check(): ), "chunk_dir": str(CHUNK_DIR), "active_clients": client_manager.get_client_count(), - "new_conversation_timeout_minutes": NEW_CONVERSATION_TIMEOUT_MINUTES, "audio_cropping_enabled": AUDIO_CROPPING_ENABLED, "llm_provider": os.getenv("LLM_PROVIDER"), "llm_model": os.getenv("OPENAI_MODEL"), diff --git a/backends/advanced/webui/src/pages/System.tsx b/backends/advanced/webui/src/pages/System.tsx index 9c1b34eb..991045a1 100644 --- a/backends/advanced/webui/src/pages/System.tsx +++ b/backends/advanced/webui/src/pages/System.tsx @@ -215,54 +215,54 @@ export default 
function System() { )} -
- {/* Services Status */} - {healthData?.services && ( -
-

- - Services Status -

-
- {Object.entries(healthData.services).map(([service, status]) => ( -
-
- {getStatusIcon(status.healthy)} - - {getServiceDisplayName(service)} + {/* Services Status - Full Width */} + {healthData?.services && ( +
+

+ + Services Status +

+
+ {Object.entries(healthData.services).map(([service, status]) => ( +
+
+ {getStatusIcon(status.healthy)} + + {getServiceDisplayName(service)} + +
+
+ {status.message && ( + + {status.message} -
-
- {status.message && ( - - {status.message} - - )} - {(status as any).status && ( - - {(status as any).status} - - )} - {(status as any).provider && ( - - ({(status as any).provider}) - - )} -
+ )} + {(status as any).status && ( + + {(status as any).status} + + )} + {(status as any).provider && ( + + ({(status as any).provider}) + + )}
- ))} -
+
+ ))}
- )} - +
+ )} + {/* Diarization & Speaker Settings - Always Horizontal */} +
{/* Diarization Settings */}

Diarization Settings

- +
{/* Diarization Source Selector */}
@@ -304,7 +304,7 @@ export default function System() {
- {diarizationSettings.diarization_source === 'deepgram' + {diarizationSettings.diarization_source === 'deepgram' ? 'Deepgram handles diarization automatically. The parameters below apply only to speaker identification.' : 'Pyannote provides local diarization with full parameter control.' } @@ -321,7 +321,7 @@ export default function System() { Note: Deepgram Diarization Mode

- Ignored parameters hidden: speaker count, collar, timing settings. + Ignored parameters hidden: speaker count, collar, timing settings. Only similarity threshold applies to speaker identification.

@@ -475,37 +475,36 @@ export default function System() { {/* Speaker Configuration */} +
- - {/* Debug Metrics */} - {metricsData?.debug_tracker && ( -
-

- Debug Metrics -

-
-
-
Total Files
-
- {metricsData.debug_tracker.total_files} -
+ {/* Debug Metrics */} + {metricsData?.debug_tracker && ( +
+

+ Debug Metrics +

+
+
+
Total Files
+
+ {metricsData.debug_tracker.total_files}
-
-
Processed
-
- {metricsData.debug_tracker.processed_files} -
+
+
+
Processed
+
+ {metricsData.debug_tracker.processed_files}
-
-
Failed
-
- {metricsData.debug_tracker.failed_files} -
+
+
+
Failed
+
+ {metricsData.debug_tracker.failed_files}
- )} -
+
+ )} {/* Memory Configuration - Full Width Section */}