feat: allow live model switching on running agentic sessions (RHOAIENG-56044)

Gkrumbach07 · claude · Gkrumbach07 · commit 65d749e9a972 · 2026-03-31T15:41:45.000Z
Enable users to change the LLM model on running sessions without stopping/restarting. Model changes take effect on the next AG-UI run, preserving conversation history. Backend Changes: - Remove Running/Creating phase restriction for llmSettings updates in UpdateSession - Add model validation against manifest and runner provider compatibility - Pass current model to runner via X-Current-Model and X-Current-Model-Vertex-ID headers - Resolve Vertex AI model IDs from manifest when needed Runner Changes: - Extract model from request headers on each /run invocation - Inject model into forwarded_props to override env var for that run - Support Vertex AI model ID override via LLM_MODEL_VERTEX_ID env var Frontend Changes: - Add UpdateAgenticSessionRequest type with llmSettings field - Add updateSession() API function for PUT /agentic-sessions/:name - (UI implementation deferred - users can update via API directly) Architecture: - Next-run-only approach: no pod restart needed, leverages existing forwarded_props mechanism - Backend resolves Vertex ID from ConfigMap, consistent with pod creation flow - No context handoff: conversation history replayed naturally (works for same-family switches) References: RHOAIENG-56044, GitHub Issue #1090 Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
diff --git a/components/backend/handlers/sessions.go b/components/backend/handlers/sessions.go
@@ -1215,17 +1215,57 @@ func UpdateSession(c *gin.Context) {
 		return
 	}
 
-	// Prevent spec changes while session is running or being created
+	// Prevent most spec changes while session is running or being created
+	// Exception: llmSettings.model can be updated live (takes effect on next run)
 	if status, ok := item.Object["status"].(map[string]interface{}); ok {
 		if phase, ok := status["phase"].(string); ok {
 			if strings.EqualFold(phase, "Running") || strings.EqualFold(phase, "Creating") {
-				c.JSON(http.StatusConflict, gin.H{
-					"error": "Cannot modify session specification while the session is running",
-					"phase": phase,
-				})
-				return
+				// Check if request is ONLY updating llmSettings (live model switching)
+				isOnlyModelUpdate := req.LLMSettings != nil &&
+					req.InitialPrompt == nil &&
+					req.DisplayName == nil &&
+					req.Timeout == nil
+
+				if !isOnlyModelUpdate {
+					c.JSON(http.StatusConflict, gin.H{
+						"error": "Cannot modify session specification while the session is running (except llmSettings)",
+						"phase": phase,
+					})
+					return
+				}
+				log.Printf("Live model update on Running session %s: %+v", sessionName, req.LLMSettings)
+			}
+		}
+	}
+
+	// Validate model if being updated
+	if req.LLMSettings != nil && req.LLMSettings.Model != "" {
+		// Get runner type from existing session spec to determine provider
+		spec := item.Object["spec"].(map[string]interface{})
+		runnerTypeID := DefaultRunnerType // default
+		if envVars, ok := spec["environmentVariables"].(map[string]interface{}); ok {
+			if rt, ok := envVars["RUNNER_TYPE"].(string); ok && rt != "" {
+				runnerTypeID = rt
 			}
 		}
+
+		// Resolve provider for validation
+		runnerProvider := ""
+		if rt, rtErr := GetRuntime(runnerTypeID); rtErr == nil {
+			runnerProvider = rt.Provider
+		} else {
+			log.Printf("WARNING: could not resolve runner type %q from registry: %v", runnerTypeID, rtErr)
+		}
+
+		// Validate model is available for this runner's provider
+		k8sClt, _ := GetK8sClientsForRequest(c)
+		if !isModelAvailable(c.Request.Context(), k8sClt, req.LLMSettings.Model, runnerProvider, project) {
+			c.JSON(http.StatusBadRequest, gin.H{
+				"error": "Model is not available for this runner type",
+				"model": req.LLMSettings.Model,
+			})
+			return
+		}
 	}
 
 	// Update spec
diff --git a/components/backend/websocket/agui_proxy.go b/components/backend/websocket/agui_proxy.go
@@ -367,8 +367,12 @@ func HandleAGUIRunProxy(c *gin.Context) {
 		log.Printf("Run with per-user credentials for %s/%s", projectName, sessionName)
 	}
 
+	// Fetch current model from session spec for live model switching
+	currentModel := getCurrentModel(projectName, sessionName)
+	currentModelVertexID := getCurrentModelVertexID(projectName, sessionName, currentModel)
+
 	// Start background goroutine to proxy runner SSE → persist + broadcast
-	go proxyRunnerStream(runnerURL, bodyBytes, sessionName, runID, threadID, currentUserID, currentUserName, callerToken)
+	go proxyRunnerStream(runnerURL, bodyBytes, sessionName, runID, threadID, currentUserID, currentUserName, callerToken, currentModel, currentModelVertexID)
 
 	// Return metadata immediately — events arrive via GET /agui/events
 	c.JSON(http.StatusOK, gin.H{
@@ -382,13 +386,14 @@ func HandleAGUIRunProxy(c *gin.Context) {
 // a background goroutine so the POST /agui/run handler can return immediately.
 // If userID is provided, forwards user context headers for credential scoping.
 // callerToken is the original user's bearer token for per-user credential requests.
-func proxyRunnerStream(runnerURL string, bodyBytes []byte, sessionName, runID, threadID, userID, userName, callerToken string) {
+// currentModel and currentModelVertexID are forwarded to enable live model switching.
+func proxyRunnerStream(runnerURL string, bodyBytes []byte, sessionName, runID, threadID, userID, userName, callerToken, currentModel, currentModelVertexID string) {
 	logSuffix := ""
 	if userID != "" {
 		logSuffix = fmt.Sprintf(" (user=%s)", userID)
 	}
 	log.Printf("AGUI Proxy: connecting to runner at %s%s", runnerURL, logSuffix)
-	resp, err := connectToRunner(runnerURL, bodyBytes, userID, userName, callerToken)
+	resp, err := connectToRunner(runnerURL, bodyBytes, userID, userName, callerToken, currentModel, currentModelVertexID)
 	if err != nil {
 		log.Printf("AGUI Proxy: runner unavailable for %s: %v", sessionName, err)
 		// Publish error events so GET /agui/events subscribers see the failure
@@ -784,7 +789,7 @@ var runnerHTTPClient = &http.Client{
 // container startup time and K8s Service DNS propagation. Retries on
 // "connection refused", "no such host", and "dial tcp" errors with
 // exponential backoff (500ms initial, 1.5x, capped at 5s, 15 attempts).
-func connectToRunner(runnerURL string, bodyBytes []byte, userID, userName, callerToken string) (*http.Response, error) {
+func connectToRunner(runnerURL string, bodyBytes []byte, userID, userName, callerToken, currentModel, currentModelVertexID string) (*http.Response, error) {
 	maxAttempts := 15
 	retryDelay := 500 * time.Millisecond
 	maxDelay := 5 * time.Second
@@ -809,6 +814,13 @@ func connectToRunner(runnerURL string, bodyBytes []byte, userID, userName, calle
 		if callerToken != "" {
 			req.Header.Set("X-Caller-Token", callerToken)
 		}
+		// Forward current model for live model switching
+		if currentModel != "" {
+			req.Header.Set("X-Current-Model", currentModel)
+		}
+		if currentModelVertexID != "" {
+			req.Header.Set("X-Current-Model-Vertex-ID", currentModelVertexID)
+		}
 
 		resp, err := runnerHTTPClient.Do(req)
 		if err == nil {
@@ -1290,3 +1302,52 @@ func HandleTaskList(c *gin.Context) {
 	body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024*1024))
 	c.Data(resp.StatusCode, resp.Header.Get("Content-Type"), body)
 }
+
+// ─── Live Model Switching Helpers ────────────────────────────────────
+
+// getCurrentModel reads spec.llmSettings.model from the AgenticSession resource,
+// bounding the API call to 5s because it runs before POST /agui/run responds.
+// Returns "" (runner falls back to its env var) on any failure; failures are logged.
+func getCurrentModel(projectName, sessionName string) string {
+	if handlers.DynamicClient == nil {
+		return ""
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	gvr := handlers.GetAgenticSessionV1Alpha1Resource()
+	obj, err := handlers.DynamicClient.Resource(gvr).Namespace(projectName).Get(ctx, sessionName, metav1.GetOptions{})
+	if err != nil {
+		log.Printf("WARNING: failed to read model for session %s/%s: %v", projectName, sessionName, err)
+		return ""
+	}
+	model, _, _ := unstructured.NestedString(obj.Object, "spec", "llmSettings", "model") // absent or non-string field yields ""
+	return model
+}
+
+// getCurrentModelVertexID looks up the Vertex AI model ID that the model manifest
+// associates with model. The result is "" when model is empty, when the manifest
+// cannot be loaded, or when no available entry has a matching ID. projectName and
+// sessionName are accepted for the caller's convenience but are not consulted.
+func getCurrentModelVertexID(projectName, sessionName, model string) string {
+	if model == "" {
+		return ""
+	}
+
+	manifest, loadErr := handlers.LoadManifest(handlers.ManifestPath())
+	if loadErr != nil {
+		log.Printf("WARNING: failed to load model manifest for Vertex ID resolution: %v", loadErr)
+		return ""
+	}
+
+	// The first available entry whose ID matches decides the result, even if empty.
+	for _, candidate := range manifest.Models {
+		if candidate.ID != model || !candidate.Available {
+			continue
+		}
+		if vertexID := candidate.VertexID; vertexID != "" {
+			log.Printf("Resolved Vertex ID for model %q: %s", model, vertexID)
+		}
+		return candidate.VertexID
+	}
+	return ""
+}
diff --git a/components/frontend/src/services/api/sessions.ts b/components/frontend/src/services/api/sessions.ts
@@ -12,6 +12,7 @@ import type {
   ListAgenticSessionsPaginatedResponse,
   StopAgenticSessionRequest,
   StopAgenticSessionResponse,
+  UpdateAgenticSessionRequest,
   CloneAgenticSessionRequest,
   CloneAgenticSessionResponse,
   PaginationParams,
@@ -188,6 +189,20 @@ export async function getSessionPodEvents(
   return apiClient.get(`/projects/${projectName}/agentic-sessions/${sessionName}/pod-events`);
 }
 
+/**
+ * Replace fields of a session's spec with a PUT to the session resource.
+ * This is the call used for live model switching: send `llmSettings` only.
+ * Resolves to the session object returned by the API client.
+ */
+export async function updateSession(
+  projectName: string,
+  sessionName: string,
+  updates: UpdateAgenticSessionRequest
+): Promise<AgenticSession> {
+  const sessionPath = `/projects/${projectName}/agentic-sessions/${sessionName}`;
+  return apiClient.put<AgenticSession, UpdateAgenticSessionRequest>(sessionPath, updates);
+}
+
 /**
  * Update the display name of a session
  */
diff --git a/components/frontend/src/types/api/sessions.ts b/components/frontend/src/types/api/sessions.ts
@@ -180,6 +180,13 @@ export type StopAgenticSessionResponse = {
   message: string;
 };
 
+export type UpdateAgenticSessionRequest = {
+  llmSettings?: Partial<LLMSettings>; // may be sent alone while the session is Running/Creating (live model switch)
+  displayName?: string; // backend answers 409 if present while the session is Running/Creating
+  initialPrompt?: string; // backend answers 409 if present while the session is Running/Creating
+  timeout?: number; // backend answers 409 if present while Running/Creating; units not shown here — TODO confirm
+};
+
 export type CloneAgenticSessionRequest = {
   targetProject: string;
   newSessionName: string;
diff --git a/components/runners/ambient-runner/ambient_runner/bridges/claude/bridge.py b/components/runners/ambient-runner/ambient_runner/bridges/claude/bridge.py
@@ -140,12 +140,29 @@ async def run(
         current_user_id: str = "",
         current_user_name: str = "",
         caller_token: str = "",
+        current_model: str = "",
+        current_model_vertex_id: str = "",
     ) -> AsyncIterator[BaseEvent]:
-        """Full run lifecycle: initialize → session worker → tracing."""
+        """Full run lifecycle: initialize → session worker → tracing.
+
+        Live model switching: current_model and current_model_vertex_id from
+        the backend override the env var model for this run.
+        """
         thread_id = input_data.thread_id or (self._context.session_id if self._context else "")
 
         await self._initialize_run(thread_id, current_user_id, current_user_name, caller_token)
 
+        # Live model switching: inject current model into forwarded_props
+        if current_model:
+            if input_data.forwarded_props is None:
+                input_data.forwarded_props = {}
+            input_data.forwarded_props["model"] = current_model
+            logger.info(f"Live model switch: using {current_model}")
+            # Update env var for Vertex AI mapping (used by setup_sdk_authentication)
+            if current_model_vertex_id:
+                os.environ["LLM_MODEL_VERTEX_ID"] = current_model_vertex_id
+                logger.info(f"Vertex ID set: {current_model_vertex_id}")
+
         from ag_ui_claude_sdk.utils import process_messages
 
         user_msg, _ = process_messages(input_data)
diff --git a/components/runners/ambient-runner/ambient_runner/endpoints/run.py b/components/runners/ambient-runner/ambient_runner/endpoints/run.py
@@ -68,13 +68,18 @@ async def run_agent(input_data: RunnerInput, request: Request):
     # The caller's bearer token — used for credential requests so each user
     # can only access their own credentials (no BOT_TOKEN impersonation).
     caller_token = request.headers.get("x-caller-token", "")
+    # Extract current model for live model switching
+    current_model = request.headers.get("x-current-model", "")
+    current_model_vertex_id = request.headers.get("x-current-model-vertex-id", "")
     if current_user_id:
         from ambient_runner.platform.auth import sanitize_user_context
 
         current_user_id, current_user_name = sanitize_user_context(
             current_user_id, current_user_name
         )
         logger.info(f"Run user context: {current_user_id}")
+    if current_model:
+        logger.info(f"Run with live model: {current_model}")
 
     logger.info(
         f"Run: thread_id={run_agent_input.thread_id}, run_id={run_agent_input.run_id}"
@@ -87,6 +92,8 @@ async def event_stream():
                 current_user_id=current_user_id,
                 current_user_name=current_user_name,
                 caller_token=caller_token,
+                current_model=current_model,
+                current_model_vertex_id=current_model_vertex_id,
             ):
                 try:
                     yield encoder.encode(event)