// virtual-banker/backend/orchestrator/orchestrator.go

package orchestrator

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/explorer/virtual-banker/backend/asr"
	"github.com/explorer/virtual-banker/backend/llm"
	"github.com/explorer/virtual-banker/backend/rag"
	"github.com/explorer/virtual-banker/backend/tools"
	"github.com/explorer/virtual-banker/backend/tts"
)

// State names the phase a session's conversation is currently in.
type State string

// Conversation states. The string values are the literal state names.
const (
	StateIdle      = State("IDLE")      // initial state; also restored once speaking finishes
	StateListening = State("LISTENING") // set by ProcessAudio before transcription
	StateThinking  = State("THINKING")  // set while a response is being generated
	StateSpeaking  = State("SPEAKING")  // set before the response is synthesized
)

// Orchestrator orchestrates conversation flow. It owns the registry of
// per-session orchestrators and the service dependencies that every new
// session is wired with (see GetOrCreateSession).
type Orchestrator struct {
	sessions map[string]*SessionOrchestrator // sessions keyed by session ID; accessed under mu
	mu       sync.RWMutex                    // protects sessions
	asr      asr.Service                     // speech-to-text service handed to each session
	tts      tts.Service                     // text-to-speech service handed to each session
	llm      llm.Gateway                     // LLM gateway handed to each session
	rag      rag.Service                     // document retrieval service handed to each session
	tools    *tools.Executor                 // tool executor handed to each session
}

// NewOrchestrator returns an Orchestrator with an empty session registry that
// will wire every session it creates with the supplied ASR, TTS, LLM, RAG and
// tool-execution dependencies. The arguments are stored as given; no
// validation is performed here.
func NewOrchestrator(asrService asr.Service, ttsService tts.Service, llmGateway llm.Gateway, ragService rag.Service, toolExecutor *tools.Executor) *Orchestrator {
	o := new(Orchestrator)

	// The registry must be allocated up front: GetOrCreateSession writes to it.
	o.sessions = map[string]*SessionOrchestrator{}

	o.asr = asrService
	o.tts = ttsService
	o.llm = llmGateway
	o.rag = ragService
	o.tools = toolExecutor

	return o
}

// SessionOrchestrator manages a single session's conversation: its current
// state, its message history, and the services used to transcribe input,
// generate a reply and speak it.
type SessionOrchestrator struct {
	sessionID    string             // key under which the session is registered in Orchestrator
	tenantID     string             // passed to RAG retrieval, LLM options and tool execution
	userID       string             // passed to LLM options and tool execution
	state        State              // current conversation state; see SetState / GetState
	mu           sync.RWMutex       // guards state, and the ctx/cancel swap done by StopSpeaking
	ctx          context.Context    // session-scoped context; replaced by StopSpeaking
	cancel       context.CancelFunc // cancels ctx; invoked by StopSpeaking and Close
	asr          asr.Service        // transcribes audio in ProcessAudio
	tts          tts.Service        // synthesizes audio and visemes in speak
	llm          llm.Gateway        // generates responses in processTranscript
	rag          rag.Service        // optional: processTranscript skips retrieval when nil
	tools        *tools.Executor    // optional: processTranscript skips tool calls when nil
	conversation []llm.Message      // history of user, tool-result and assistant messages
}

// GetOrCreateSession returns the session orchestrator registered under
// sessionID, creating and registering one if none exists yet.
//
// tenantID and userID are only used when a new session is created; for an
// already-registered sessionID the stored session is returned unchanged.
// A new session starts idle, with an empty conversation, its own cancellable
// context and the Orchestrator's service dependencies.
func (o *Orchestrator) GetOrCreateSession(sessionID, tenantID, userID string) *SessionOrchestrator {
	// Fast path: look the session up under the read lock only.
	o.mu.RLock()
	existing, found := o.sessions[sessionID]
	o.mu.RUnlock()
	if found {
		return existing
	}

	// Slow path: take the write lock and look again, since another caller may
	// have registered the session between the two lock acquisitions.
	o.mu.Lock()
	defer o.mu.Unlock()

	if raced, found := o.sessions[sessionID]; found {
		return raced
	}

	sessionCtx, stop := context.WithCancel(context.Background())
	created := &SessionOrchestrator{
		sessionID:    sessionID,
		tenantID:     tenantID,
		userID:       userID,
		state:        StateIdle,
		ctx:          sessionCtx,
		cancel:       stop,
		asr:          o.asr,
		tts:          o.tts,
		llm:          o.llm,
		rag:          o.rag,
		tools:        o.tools,
		conversation: []llm.Message{},
	}
	o.sessions[sessionID] = created

	return created
}

// ProcessAudio runs one full voice turn: it transcribes audioData, generates
// a response for the transcript, and speaks that response.
//
// If the session is currently speaking, the ongoing speech is stopped first
// (barge-in). The state moves LISTENING -> THINKING -> SPEAKING as the turn
// progresses. When any step fails the state is returned to IDLE so the
// session is not left reporting a phase that is no longer in progress, and
// the error is returned (transcription and processing errors are wrapped).
func (so *SessionOrchestrator) ProcessAudio(ctx context.Context, audioData []byte) error {
	// Handle barge-in: if speaking, stop and switch to listening.
	// StopSpeaking re-checks the state under the lock, so a stale read here
	// is harmless.
	if so.GetState() == StateSpeaking {
		so.StopSpeaking()
	}

	so.SetState(StateListening)

	// Transcribe audio
	transcript, err := so.asr.Transcribe(ctx, audioData)
	if err != nil {
		so.SetState(StateIdle)
		return fmt.Errorf("failed to transcribe: %w", err)
	}

	// Process transcript
	so.SetState(StateThinking)
	response, err := so.processTranscript(ctx, transcript)
	if err != nil {
		so.SetState(StateIdle)
		return fmt.Errorf("failed to process transcript: %w", err)
	}

	// Synthesize response
	so.SetState(StateSpeaking)
	if err := so.speak(ctx, response); err != nil {
		so.SetState(StateIdle)
		return err
	}
	return nil
}

// ProcessText runs one full text turn: it generates a response for text and
// speaks that response.
//
// The state moves THINKING -> SPEAKING as the turn progresses. When either
// step fails the state is returned to IDLE so the session is not left
// reporting a phase that is no longer in progress, and the error is returned
// (processing errors are wrapped). This method does not itself interrupt
// speech that is already in progress.
func (so *SessionOrchestrator) ProcessText(ctx context.Context, text string) error {
	so.SetState(StateThinking)

	// Process text
	response, err := so.processTranscript(ctx, text)
	if err != nil {
		so.SetState(StateIdle)
		return fmt.Errorf("failed to process text: %w", err)
	}

	// Synthesize response
	so.SetState(StateSpeaking)
	if err := so.speak(ctx, response); err != nil {
		so.SetState(StateIdle)
		return err
	}
	return nil
}

// processTranscript records transcript as a user message, asks the LLM for a
// response, executes any tool calls the LLM requested, and records the
// outcome in the conversation history.
//
// It returns the LLM's response text, or a wrapped error if generation fails.
// RAG retrieval and tool execution are best-effort: their errors are logged
// and the turn continues. Retrieval is skipped when so.rag is nil and tool
// calls are skipped when so.tools is nil.
//
// The conversation history is only touched while holding so.mu, and the LLM
// receives a private copy of it, so overlapping turns on the same session do
// not race on the slice. The lock is not held across the retrieval, LLM or
// tool calls.
func (so *SessionOrchestrator) processTranscript(ctx context.Context, transcript string) (string, error) {
	// Add user message to conversation and snapshot the history for the LLM.
	so.mu.Lock()
	so.conversation = append(so.conversation, llm.Message{
		Role:    "user",
		Content: transcript,
	})
	history := make([]llm.Message, len(so.conversation))
	copy(history, so.conversation)
	so.mu.Unlock()

	// Retrieve relevant documents from RAG (best-effort).
	var retrievedDocs []rag.RetrievedDoc
	if so.rag != nil {
		docs, err := so.rag.Retrieve(ctx, transcript, so.tenantID, 5)
		if err != nil {
			// Log error but continue without retrieved context
			fmt.Printf("RAG retrieval error: %v\n", err)
		} else {
			retrievedDocs = docs
		}
	}

	// Convert retrieved docs to LLM format.
	// NOTE(review): ragDocs is built here but nothing below passes it to the
	// LLM, so retrieved context currently has no effect on the response.
	// Presumably it should be attached to the request — confirm against the
	// llm package and wire it in.
	ragDocs := make([]llm.RetrievedDoc, len(retrievedDocs))
	for i, doc := range retrievedDocs {
		ragDocs[i] = llm.RetrievedDoc{
			Title:   doc.Title,
			Content: doc.Content,
			URL:     doc.URL,
			Score:   doc.Score,
		}
	}

	// Get available tools (would come from tenant config)
	availableTools := []llm.Tool{} // TODO: Get from tenant config

	// Call LLM
	options := &llm.GenerateOptions{
		Temperature:         0.7,
		MaxTokens:           500,
		Tools:               availableTools,
		TenantID:            so.tenantID,
		UserID:              so.userID,
		ConversationHistory: history,
	}

	response, err := so.llm.Generate(ctx, transcript, options)
	if err != nil {
		return "", fmt.Errorf("failed to generate response: %w", err)
	}

	// Execute tool calls if any, collecting the messages to record. Only
	// successful results are recorded.
	var toolMessages []llm.Message
	if len(response.Tools) > 0 && so.tools != nil {
		for _, toolCall := range response.Tools {
			result, err := so.tools.Execute(ctx, toolCall.Name, toolCall.Arguments, so.userID, so.tenantID)
			if err != nil {
				// Log error but continue
				fmt.Printf("Tool execution error: %v\n", err)
				continue
			}

			if result.Success {
				toolMessages = append(toolMessages, llm.Message{
					Role:    "assistant",
					Content: fmt.Sprintf("Tool %s executed successfully: %v", toolCall.Name, result.Data),
				})
			}
		}
	}

	// Record tool results followed by the assistant response.
	so.mu.Lock()
	so.conversation = append(so.conversation, toolMessages...)
	so.conversation = append(so.conversation, llm.Message{
		Role:    "assistant",
		Content: response.Text,
	})
	so.mu.Unlock()

	return response.Text, nil
}

// speak synthesizes text to audio, fetches visemes for the avatar, and then
// waits for the estimated playback time before returning the session to idle.
//
// Delivery to the client is not implemented yet, so playback is simulated
// with a timer of 50ms per byte of text. The wait ends early when:
//   - the session context is cancelled (StopSpeaking on barge-in, or Close):
//     speak returns nil and leaves the state untouched, because whoever
//     cancelled is now driving the state machine;
//   - the caller's ctx is cancelled: the state is reset to idle and the
//     wrapped context error is returned.
//
// A synthesis failure resets the state to idle and is returned wrapped.
// A viseme failure is logged and otherwise ignored.
func (so *SessionOrchestrator) speak(ctx context.Context, text string) error {
	// Snapshot the session context that a barge-in will cancel for this
	// utterance. StopSpeaking replaces so.ctx under so.mu, so read it under
	// the lock.
	so.mu.RLock()
	sessionCtx := so.ctx
	so.mu.RUnlock()

	// A nil channel is never ready in a select, which is what we want when
	// the session has no context to watch.
	var interrupted <-chan struct{}
	if sessionCtx != nil {
		interrupted = sessionCtx.Done()
	}

	// Synthesize audio
	audioData, err := so.tts.Synthesize(ctx, text)
	if err != nil {
		// Nothing will be spoken; do not leave the session stuck in SPEAKING.
		so.SetState(StateIdle)
		return fmt.Errorf("failed to synthesize: %w", err)
	}

	// Get visemes for avatar
	visemes, err := so.tts.GetVisemes(ctx, text)
	if err != nil {
		// Log error but continue
		fmt.Printf("Failed to get visemes: %v\n", err)
	}

	// TODO: Send audio and visemes to client via WebRTC/WebSocket
	_ = audioData
	_ = visemes

	// Simulate speaking duration, but stay interruptible.
	playback := time.NewTimer(time.Duration(len(text)*50) * time.Millisecond)
	defer playback.Stop()

	select {
	case <-playback.C:
		so.SetState(StateIdle)
		return nil
	case <-interrupted:
		// Barge-in or Close: the canceller owns the state from here on, so
		// do not overwrite it with IDLE.
		return nil
	case <-ctx.Done():
		so.SetState(StateIdle)
		return fmt.Errorf("speaking interrupted: %w", ctx.Err())
	}
}

// StopSpeaking interrupts the current speech (barge-in). When the session is
// in the speaking state it cancels the session context, installs a fresh
// cancellable context in its place, and moves the session to idle. In any
// other state it does nothing.
func (so *SessionOrchestrator) StopSpeaking() {
	so.mu.Lock()
	defer so.mu.Unlock()

	if so.state != StateSpeaking {
		return
	}

	// Signal whatever is watching the current session context, then replace
	// it so the session stays usable for the next turn.
	so.cancel()
	so.ctx, so.cancel = context.WithCancel(context.Background())
	so.state = StateIdle
}

// SetState unconditionally replaces the session's conversation state with
// state, holding the write lock for the assignment.
func (so *SessionOrchestrator) SetState(state State) {
	so.mu.Lock()
	so.state = state
	so.mu.Unlock()
}

// GetState returns the session's conversation state as of the moment of the
// call, read under the read lock.
func (so *SessionOrchestrator) GetState() State {
	so.mu.RLock()
	current := so.state
	so.mu.RUnlock()

	return current
}

// Close cancels the session's current context.
//
// The cancel function is read and invoked while holding so.mu because
// StopSpeaking replaces so.ctx/so.cancel under the same lock; without it a
// concurrent barge-in would race with Close, and Close could cancel a
// context that has already been swapped out. Close is a no-op on a session
// that has no cancel function, and it is safe to call more than once.
func (so *SessionOrchestrator) Close() {
	so.mu.Lock()
	defer so.mu.Unlock()

	if so.cancel != nil {
		so.cancel()
	}
}