From 318933d53efa49f866040607b5c31f1d03e2e664 Mon Sep 17 00:00:00 2001
From: Peter Jausovec <peter.jausovec@solo.io>
Date: Thu, 11 Jun 2026 16:02:45 -0700
Subject: [PATCH 1/2] implement agent reliability features from ADK

Signed-off-by: Peter Jausovec <peter.jausovec@solo.io>
---
 docs/architecture/crds-and-types.md           |  13 +-
 go/adk/pkg/agent/agent.go                     |  14 +
 go/adk/pkg/models/anthropic.go                |   9 +
 go/adk/pkg/models/base.go                     |   1 +
 go/adk/pkg/models/openai.go                   |   6 +
 go/adk/pkg/runner/adapter.go                  |  51 +++
 go/adk/pkg/runner/adapter_test.go             | 128 ++++++++
 go/adk/pkg/runner/maxllmcalls.go              |  42 +++
 go/api/adk/types.go                           |  21 ++
 .../config/crd/bases/kagent.dev_agents.yaml   |  29 ++
 .../crd/bases/kagent.dev_modelconfigs.yaml    |  20 ++
 .../crd/bases/kagent.dev_sandboxagents.yaml   |  29 ++
 go/api/v1alpha2/agent_types.go                |  29 ++
 go/api/v1alpha2/modelconfig_types.go          |  20 ++
 go/api/v1alpha2/zz_generated.deepcopy.go      |  50 +++
 .../translator/agent/adk_api_translator.go    |  51 +--
 .../controller/translator/agent/compiler.go   |  13 +
 .../inputs/agent_with_reliability.yaml        |  37 +++
 .../inputs/modelconfig_with_retry.yaml        |  35 ++
 .../outputs/agent_with_reliability.json       | 299 ++++++++++++++++++
 .../outputs/modelconfig_with_retry.json       | 295 +++++++++++++++++
 .../templates/kagent.dev_agents.yaml          |  29 ++
 .../templates/kagent.dev_modelconfigs.yaml    |  20 ++
 .../templates/kagent.dev_sandboxagents.yaml   |  29 ++
 .../kagent-adk/src/kagent/adk/_a2a.py         |   9 +-
 .../src/kagent/adk/_agent_executor.py         |  22 +-
 .../src/kagent/adk/_reflect_retry_plugin.py   |  33 ++
 .../packages/kagent-adk/src/kagent/adk/cli.py |  23 ++
 .../adk/converters/request_converter.py       |   7 +-
 .../src/kagent/adk/models/_anthropic.py       |   6 +
 .../src/kagent/adk/models/_openai.py          |  14 +
 .../kagent-adk/src/kagent/adk/types.py        |  40 ++-
 .../unittests/models/test_sap_ai_core.py      |   1 -
 .../tests/unittests/test_model_retry.py       |  74 +++++
 .../unittests/test_reliability_config.py      |  96 ++++++
 ui/src/app/actions/agents.ts                  |  16 +
 ui/src/app/agents/new/page.tsx                | 108 ++++++-
 ui/src/app/models/new/page.tsx                |  54 +++-
 ui/src/components/AgentsProvider.tsx          |   6 +
 ui/src/types/index.ts                         |  17 +
 40 files changed, 1762 insertions(+), 34 deletions(-)
 create mode 100644 go/adk/pkg/runner/adapter_test.go
 create mode 100644 go/adk/pkg/runner/maxllmcalls.go
 create mode 100644 go/core/internal/controller/translator/agent/testdata/inputs/agent_with_reliability.yaml
 create mode 100644 go/core/internal/controller/translator/agent/testdata/inputs/modelconfig_with_retry.yaml
 create mode 100644 go/core/internal/controller/translator/agent/testdata/outputs/agent_with_reliability.json
 create mode 100644 go/core/internal/controller/translator/agent/testdata/outputs/modelconfig_with_retry.json
 create mode 100644 python/packages/kagent-adk/src/kagent/adk/_reflect_retry_plugin.py
 create mode 100644 python/packages/kagent-adk/tests/unittests/test_model_retry.py
 create mode 100644 python/packages/kagent-adk/tests/unittests/test_reliability_config.py

diff --git a/docs/architecture/crds-and-types.md b/docs/architecture/crds-and-types.md
index 7b0a084400..d886e114e2 100644
--- a/docs/architecture/crds-and-types.md
+++ b/docs/architecture/crds-and-types.md
@@ -74,6 +74,10 @@ AgentSpec
 │   │       ├── summarizer: ContextSummarizerConfig
 │   │       ├── tokenThreshold: int
 │   │       └── eventRetentionSize: int
+│   ├── reliability: ReliabilityConfig
+│   │   ├── toolRetries: int (reflect-and-retry on failed tool calls)
+│   │   ├── maxLLMCalls: int (cap on model calls per request)
+│   │   └── debugLogging: bool (log every LLM request/response and tool call)
 │   └── executeCodeBlocks: bool (currently ignored)
 │
 └── byo: BYOAgentSpec (if type=BYO)
@@ -126,6 +130,10 @@ ModelConfigSpec
 │   ├── caCertSecretKey: string
 │   └── disableSystemCAs: bool
 │
+├── retry: ModelRetryConfig
+│   └── attempts: int (max retries of failed LLM HTTP requests with exponential backoff;
+│                      OpenAI/AzureOpenAI/Anthropic/Gemini only)
+│
 ├── openAI: OpenAIConfig
 │   ├── baseUrl, temperature, maxTokens, topP
 │   ├── frequencyPenalty, presencePenalty
@@ -340,7 +348,8 @@ When adding a field to an existing CRD, update all layers:
 5. **Translator** — `go/core/internal/controller/translator/agent/adk_api_translator.go` (wire field into config)
 6. **Python ADK types** — `python/packages/kagent-adk/src/kagent/adk/types.py` (mirror Go types)
 7. **Python runtime** — Use the field in agent setup if it affects runtime behavior
-8. **Tests** — Translator unit tests (golden files), E2E tests
-9. **Helm values** — If exposed to users installing via Helm
+8. **Go runtime** — `go/adk/pkg/` (mirror runtime behavior for `runtime: go` agents)
+9. **Tests** — Translator unit tests (golden files), E2E tests
+10. **Helm values** — If exposed to users installing via Helm
 
 See [controller-reconciliation.md](controller-reconciliation.md) for the reconciliation flow and the kagent-dev skill for step-by-step examples.
diff --git a/go/adk/pkg/agent/agent.go b/go/adk/pkg/agent/agent.go
index 1aae3637d6..a14df02bec 100644
--- a/go/adk/pkg/agent/agent.go
+++ b/go/adk/pkg/agent/agent.go
@@ -228,6 +228,7 @@ func CreateLLM(ctx context.Context, m adk.Model, log logr.Logger) (adkmodel.LLM,
 		if err != nil {
 			return nil, fmt.Errorf("failed to build HTTP client for Gemini: %w", err)
 		}
+		warnIgnoredMaxRetries(log, m.BaseModel, "gemini")
 		return adkgemini.NewModel(ctx, modelName, &genai.ClientConfig{
 			APIKey:     apiKey,
 			HTTPClient: httpClient,
@@ -246,6 +247,7 @@ func CreateLLM(ctx context.Context, m adk.Model, log logr.Logger) (adkmodel.LLM,
 		if modelName == "" {
 			modelName = DefaultGeminiModel
 		}
+		warnIgnoredMaxRetries(log, m.BaseModel, "gemini_vertex_ai")
 		return adkgemini.NewModel(ctx, modelName, &genai.ClientConfig{
 			Backend:  genai.BackendVertexAI,
 			Project:  project,
@@ -278,6 +280,7 @@ func CreateLLM(ctx context.Context, m adk.Model, log logr.Logger) (adkmodel.LLM,
 			modelName = DefaultOllamaModel
 		}
 		// Create OllamaConfig with native SDK support for Ollama-specific options
+		warnIgnoredMaxRetries(log, m.BaseModel, "ollama")
 		cfg := &models.OllamaConfig{
 			TransportConfig: transportConfigFromBase(m.BaseModel, nil),
 			Model:           modelName,
@@ -299,6 +302,7 @@ func CreateLLM(ctx context.Context, m adk.Model, log logr.Logger) (adkmodel.LLM,
 			return nil, fmt.Errorf("bedrock requires a model name (e.g. anthropic.claude-3-sonnet-20240229-v1:0)")
 		}
 		// Use Bedrock Converse API for ALL models (including Anthropic)
+		warnIgnoredMaxRetries(log, m.BaseModel, "bedrock")
 		cfg := &models.BedrockConfig{
 			TransportConfig:              transportConfigFromBase(m.BaseModel, nil),
 			Model:                        modelName,
@@ -329,6 +333,7 @@ func CreateLLM(ctx context.Context, m adk.Model, log logr.Logger) (adkmodel.LLM,
 		return models.NewAnthropicVertexAIModelWithLogger(ctx, cfg, region, project, log)
 
 	case *adk.SAPAICore:
+		warnIgnoredMaxRetries(log, m.BaseModel, "sap_ai_core")
 		cfg := models.SAPAICoreConfig{
 			Model:         m.Model,
 			BaseUrl:       m.BaseUrl,
@@ -352,6 +357,15 @@ func transportConfigFromBase(b adk.BaseModel, timeout *int) models.TransportConf
 		TLSDisableSystemCAs:   b.TLSDisableSystemCAs,
 		APIKeyPassthrough:     b.APIKeyPassthrough,
 		Timeout:               timeout,
+		MaxRetries:            b.MaxRetries,
+	}
+}
+
+// warnIgnoredMaxRetries logs a warning when retry configuration is set on a
+// provider whose Go SDK does not support configurable HTTP retries.
+func warnIgnoredMaxRetries(log logr.Logger, b adk.BaseModel, provider string) {
+	if b.MaxRetries != nil {
+		log.Info("Model retry configuration (max_retries) is not supported for this provider in the Go runtime; ignoring", "provider", provider)
 	}
 }
 
diff --git a/go/adk/pkg/models/anthropic.go b/go/adk/pkg/models/anthropic.go
index 0dee13adcb..8ce2162e8f 100644
--- a/go/adk/pkg/models/anthropic.go
+++ b/go/adk/pkg/models/anthropic.go
@@ -65,6 +65,9 @@ func newAnthropicModelFromConfig(config *AnthropicConfig, apiKey string, logger
 	if config.BaseUrl != "" {
 		opts = append(opts, option.WithBaseURL(config.BaseUrl))
 	}
+	if config.MaxRetries != nil {
+		opts = append(opts, option.WithMaxRetries(*config.MaxRetries))
+	}
 
 	// Create HTTP client with TLS, custom headers, and timeout.
 	httpClient, err := BuildHTTPClient(config.TransportConfig)
@@ -95,6 +98,9 @@ func NewAnthropicVertexAIModelWithLogger(ctx context.Context, config *AnthropicC
 	opts := []option.RequestOption{
 		vertex.WithGoogleAuth(ctx, region, projectID),
 	}
+	if config.MaxRetries != nil {
+		opts = append(opts, option.WithMaxRetries(*config.MaxRetries))
+	}
 
 	// Create HTTP client with timeout, custom headers, TLS, and passthrough
 	httpClient, err := BuildHTTPClient(config.TransportConfig)
@@ -126,6 +132,9 @@ func NewAnthropicBedrockModelWithLogger(ctx context.Context, config *AnthropicCo
 			awsconfig.WithRegion(region),
 		),
 	}
+	if config.MaxRetries != nil {
+		opts = append(opts, option.WithMaxRetries(*config.MaxRetries))
+	}
 
 	// Create HTTP client with timeout, custom headers, TLS, and passthrough
 	httpClient, err := BuildHTTPClient(config.TransportConfig)
diff --git a/go/adk/pkg/models/base.go b/go/adk/pkg/models/base.go
index 572687bf2d..b9e9872d19 100644
--- a/go/adk/pkg/models/base.go
+++ b/go/adk/pkg/models/base.go
@@ -20,6 +20,7 @@ type TransportConfig struct {
 	TLSDisableSystemCAs   *bool
 	APIKeyPassthrough     bool
 	Timeout               *int // seconds; nil = defaultTimeout
+	MaxRetries            *int // HTTP retry attempts for transient failures; nil = SDK default
 }
 
 // BuildHTTPClient creates an http.Client with the full transport stack:
diff --git a/go/adk/pkg/models/openai.go b/go/adk/pkg/models/openai.go
index fc975cce00..4f959d2141 100644
--- a/go/adk/pkg/models/openai.go
+++ b/go/adk/pkg/models/openai.go
@@ -84,6 +84,9 @@ func newOpenAIModelFromConfig(config *OpenAIConfig, apiKey string, logger logr.L
 	if config.BaseUrl != "" {
 		opts = append(opts, option.WithBaseURL(config.BaseUrl))
 	}
+	if config.MaxRetries != nil {
+		opts = append(opts, option.WithMaxRetries(*config.MaxRetries))
+	}
 	httpClient, err := BuildHTTPClient(config.TransportConfig)
 	if err != nil {
 		return nil, err
@@ -123,6 +126,9 @@ func NewAzureOpenAIModelWithLogger(config *AzureOpenAIConfig, logger logr.Logger
 		option.WithQueryAdd("api-version", apiVersion),
 		option.WithMiddleware(azurePathRewriteMiddleware()),
 	}
+	if config.MaxRetries != nil {
+		opts = append(opts, option.WithMaxRetries(*config.MaxRetries))
+	}
 
 	if !config.APIKeyPassthrough {
 		apiKey := os.Getenv("AZURE_OPENAI_API_KEY")
diff --git a/go/adk/pkg/runner/adapter.go b/go/adk/pkg/runner/adapter.go
index 0441f778c0..f13bfa44c9 100644
--- a/go/adk/pkg/runner/adapter.go
+++ b/go/adk/pkg/runner/adapter.go
@@ -14,6 +14,8 @@ import (
 	"github.com/kagent-dev/kagent/go/api/adk"
 	adkmemory "google.golang.org/adk/memory"
 	adkplugin "google.golang.org/adk/plugin"
+	"google.golang.org/adk/plugin/loggingplugin"
+	"google.golang.org/adk/plugin/retryandreflect"
 	"google.golang.org/adk/runner"
 	adksession "google.golang.org/adk/session"
 	adktool "google.golang.org/adk/tool"
@@ -83,6 +85,12 @@ func CreateRunnerConfig(
 		}
 	}
 
+	reliabilityPlugins, err := buildReliabilityPlugins(agentConfig.Reliability, log)
+	if err != nil {
+		return runner.Config{}, nil, err
+	}
+	adkPlugins = append(adkPlugins, reliabilityPlugins...)
+
 	cfg := runner.Config{
 		AppName:        appName,
 		Agent:          adkAgent,
@@ -96,6 +104,49 @@ func CreateRunnerConfig(
 	return cfg, subagentSessionIDs, nil
 }
 
+// buildReliabilityPlugins translates the agent reliability configuration into
+// ADK plugins: debug logging, tool retry (reflect-and-retry), and a max LLM
+// calls limit.
+func buildReliabilityPlugins(r *adk.ReliabilityConfig, log logr.Logger) ([]*adkplugin.Plugin, error) {
+	if r == nil {
+		return nil, nil
+	}
+
+	var plugins []*adkplugin.Plugin
+
+	if r.DebugLogging != nil && *r.DebugLogging {
+		p, err := loggingplugin.New("kagent_debug_logging")
+		if err != nil {
+			return nil, fmt.Errorf("failed to create debug logging plugin: %w", err)
+		}
+		plugins = append(plugins, p)
+		log.Info("Debug logging enabled for agent")
+	}
+
+	if r.ToolRetries != nil && *r.ToolRetries > 0 {
+		p, err := retryandreflect.New(
+			retryandreflect.WithMaxRetries(*r.ToolRetries),
+			retryandreflect.WithErrorIfRetryExceeded(false),
+		)
+		if err != nil {
+			return nil, fmt.Errorf("failed to create tool retry plugin: %w", err)
+		}
+		plugins = append(plugins, p)
+		log.Info("Tool retry enabled for agent", "toolRetries", *r.ToolRetries)
+	}
+
+	if r.MaxLLMCalls != nil && *r.MaxLLMCalls > 0 {
+		p, err := newMaxLLMCallsPlugin(*r.MaxLLMCalls)
+		if err != nil {
+			return nil, fmt.Errorf("failed to create max LLM calls plugin: %w", err)
+		}
+		plugins = append(plugins, p)
+		log.Info("Max LLM calls limit enabled for agent", "maxLLMCalls", *r.MaxLLMCalls)
+	}
+
+	return plugins, nil
+}
+
 func buildTokenPropagationPlugin(ctx context.Context, log logr.Logger) (*sts.TokenPropagationPlugin, error) {
 	propagateToken := strings.EqualFold(strings.TrimSpace(os.Getenv("KAGENT_PROPAGATE_TOKEN")), "true")
 	stsWellKnownURI := strings.TrimSpace(os.Getenv("STS_WELL_KNOWN_URI"))
diff --git a/go/adk/pkg/runner/adapter_test.go b/go/adk/pkg/runner/adapter_test.go
new file mode 100644
index 0000000000..b15ab10679
--- /dev/null
+++ b/go/adk/pkg/runner/adapter_test.go
@@ -0,0 +1,128 @@
+package runner
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	"github.com/go-logr/logr"
+	"github.com/kagent-dev/kagent/go/api/adk"
+	"google.golang.org/adk/agent"
+	"google.golang.org/adk/session"
+	"google.golang.org/genai"
+)
+
+func TestBuildReliabilityPlugins(t *testing.T) {
+	tests := []struct {
+		name      string
+		config    *adk.ReliabilityConfig
+		wantNames []string
+	}{
+		{
+			name:      "nil config",
+			config:    nil,
+			wantNames: nil,
+		},
+		{
+			name:      "empty config",
+			config:    &adk.ReliabilityConfig{},
+			wantNames: nil,
+		},
+		{
+			name:      "debug logging only",
+			config:    &adk.ReliabilityConfig{DebugLogging: new(true)},
+			wantNames: []string{"kagent_debug_logging"},
+		},
+		{
+			name:      "debug logging false",
+			config:    &adk.ReliabilityConfig{DebugLogging: new(false)},
+			wantNames: nil,
+		},
+		{
+			name:      "tool retries only",
+			config:    &adk.ReliabilityConfig{ToolRetries: new(3)},
+			wantNames: []string{"RetryAndReflectPlugin"},
+		},
+		{
+			name:      "max llm calls only",
+			config:    &adk.ReliabilityConfig{MaxLLMCalls: new(10)},
+			wantNames: []string{"kagent_max_llm_calls"},
+		},
+		{
+			name: "all enabled",
+			config: &adk.ReliabilityConfig{
+				ToolRetries:  new(2),
+				MaxLLMCalls:  new(50),
+				DebugLogging: new(true),
+			},
+			wantNames: []string{"kagent_debug_logging", "RetryAndReflectPlugin", "kagent_max_llm_calls"},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			plugins, err := buildReliabilityPlugins(tt.config, logr.Discard())
+			if err != nil {
+				t.Fatalf("buildReliabilityPlugins() error = %v", err)
+			}
+			if len(plugins) != len(tt.wantNames) {
+				t.Fatalf("got %d plugins, want %d", len(plugins), len(tt.wantNames))
+			}
+			for i, want := range tt.wantNames {
+				if got := plugins[i].Name(); got != want {
+					t.Errorf("plugin[%d].Name() = %q, want %q", i, got, want)
+				}
+			}
+		})
+	}
+}
+
+// fakeCallbackContext is a minimal agent.CallbackContext for testing.
+type fakeCallbackContext struct {
+	context.Context
+	invocationID string
+}
+
+func (f *fakeCallbackContext) UserContent() *genai.Content          { return nil }
+func (f *fakeCallbackContext) InvocationID() string                 { return f.invocationID }
+func (f *fakeCallbackContext) AgentName() string                    { return "test-agent" }
+func (f *fakeCallbackContext) ReadonlyState() session.ReadonlyState { return nil }
+func (f *fakeCallbackContext) UserID() string                       { return "user" }
+func (f *fakeCallbackContext) AppName() string                      { return "app" }
+func (f *fakeCallbackContext) SessionID() string                    { return "session" }
+func (f *fakeCallbackContext) Branch() string                       { return "" }
+func (f *fakeCallbackContext) Artifacts() agent.Artifacts           { return nil }
+func (f *fakeCallbackContext) State() session.State                 { return nil }
+
+func TestMaxLLMCallsPlugin(t *testing.T) {
+	p, err := newMaxLLMCallsPlugin(2)
+	if err != nil {
+		t.Fatalf("newMaxLLMCallsPlugin() error = %v", err)
+	}
+	cb := p.BeforeModelCallback()
+	if cb == nil {
+		t.Fatal("BeforeModelCallback is nil")
+	}
+
+	ctxA := &fakeCallbackContext{Context: context.Background(), invocationID: "inv-a"}
+	ctxB := &fakeCallbackContext{Context: context.Background(), invocationID: "inv-b"}
+
+	// First two calls within the limit succeed.
+	for i := range 2 {
+		if _, err := cb(ctxA, nil); err != nil {
+			t.Fatalf("call %d: unexpected error: %v", i+1, err)
+		}
+	}
+
+	// Third call exceeds the limit.
+	if _, err := cb(ctxA, nil); err == nil {
+		t.Fatal("expected error after exceeding limit, got nil")
+	} else if !strings.Contains(err.Error(), "limit of 2 model calls") {
+		t.Errorf("unexpected error message: %v", err)
+	}
+
+	// A different invocation has its own counter.
+	if _, err := cb(ctxB, nil); err != nil {
+		t.Fatalf("different invocation should not be limited: %v", err)
+	}
+}
diff --git a/go/adk/pkg/runner/maxllmcalls.go b/go/adk/pkg/runner/maxllmcalls.go
new file mode 100644
index 0000000000..38b024d3f0
--- /dev/null
+++ b/go/adk/pkg/runner/maxllmcalls.go
@@ -0,0 +1,42 @@
+package runner
+
+import (
+	"fmt"
+	"sync"
+
+	"google.golang.org/adk/agent"
+	"google.golang.org/adk/model"
+	adkplugin "google.golang.org/adk/plugin"
+)
+
+// newMaxLLMCallsPlugin returns a plugin that enforces a limit on the number of
+// LLM calls within a single invocation. The Go ADK has no native equivalent of
+// the Python RunConfig.max_llm_calls, so this is implemented as a
+// BeforeModelCallback that counts model calls per invocation ID and aborts the
+// run when the limit is exceeded.
+func newMaxLLMCallsPlugin(limit int) (*adkplugin.Plugin, error) {
+	var mu sync.Mutex
+	counts := make(map[string]int)
+
+	return adkplugin.New(adkplugin.Config{
+		Name: "kagent_max_llm_calls",
+		BeforeModelCallback: func(ctx agent.CallbackContext, llmRequest *model.LLMRequest) (*model.LLMResponse, error) {
+			mu.Lock()
+			defer mu.Unlock()
+			id := ctx.InvocationID()
+			counts[id]++
+			if counts[id] > limit {
+				return nil, fmt.Errorf(
+					"agent stopped: exceeded the configured limit of %d model calls in a single run (reliability.maxLLMCalls)",
+					limit,
+				)
+			}
+			return nil, nil
+		},
+		AfterRunCallback: func(ictx agent.InvocationContext) {
+			mu.Lock()
+			defer mu.Unlock()
+			delete(counts, ictx.InvocationID())
+		},
+	})
+}
diff --git a/go/api/adk/types.go b/go/api/adk/types.go
index 4f15f9f899..6d739e0b46 100644
--- a/go/api/adk/types.go
+++ b/go/api/adk/types.go
@@ -61,6 +61,11 @@ type BaseModel struct {
 	// APIKeyPassthrough enables forwarding the Bearer token from incoming requests
 	// as the LLM API key instead of using a static secret.
 	APIKeyPassthrough bool `json:"api_key_passthrough,omitempty"`
+
+	// MaxRetries is the maximum number of times a failed LLM HTTP request
+	// (429 rate limit, 408 timeout, transient 5xx) is retried with
+	// exponential backoff by the provider SDK. Nil means SDK default.
+	MaxRetries *int `json:"max_retries,omitempty"`
 }
 
 // GDCHTokenExchangeConfig holds the GDCH-specific token exchange fields
@@ -442,6 +447,19 @@ type NetworkConfig struct {
 	AllowedDomains []string `json:"allowed_domains,omitempty"`
 }
 
+// ReliabilityConfig groups reliability-related configuration that flows
+// through config.json to the Python runtime.
+type ReliabilityConfig struct {
+	// ToolRetries is the maximum number of consecutive failures for a tool
+	// call before the runtime stops retrying it (reflect-and-retry).
+	ToolRetries *int `json:"tool_retries,omitempty"`
+	// MaxLLMCalls caps the total number of model calls per request.
+	MaxLLMCalls *int `json:"max_llm_calls,omitempty"`
+	// DebugLogging enables verbose logging of every LLM request/response
+	// and tool call.
+	DebugLogging *bool `json:"debug_logging,omitempty"`
+}
+
 // AgentContextConfig is the context management configuration that flows through config.json to the Python runtime.
 type AgentContextConfig struct {
 	Compaction *AgentCompressionConfig `json:"compaction,omitempty"`
@@ -497,6 +515,7 @@ type AgentConfig struct {
 	Memory        *MemoryConfig         `json:"memory,omitempty"`
 	Network       *NetworkConfig        `json:"network,omitempty"`
 	ContextConfig *AgentContextConfig   `json:"context_config,omitempty"`
+	Reliability   *ReliabilityConfig    `json:"reliability,omitempty"`
 }
 
 // GetStream returns the stream value or default if not set
@@ -528,6 +547,7 @@ func (a *AgentConfig) UnmarshalJSON(data []byte) error {
 		Memory        json.RawMessage       `json:"memory"`
 		Network       *NetworkConfig        `json:"network,omitempty"`
 		ContextConfig *AgentContextConfig   `json:"context_config,omitempty"`
+		Reliability   *ReliabilityConfig    `json:"reliability,omitempty"`
 	}
 	if err := json.Unmarshal(data, &tmp); err != nil {
 		return err
@@ -557,6 +577,7 @@ func (a *AgentConfig) UnmarshalJSON(data []byte) error {
 	a.Memory = memory
 	a.Network = tmp.Network
 	a.ContextConfig = tmp.ContextConfig
+	a.Reliability = tmp.Reliability
 	return nil
 }
 
diff --git a/go/api/config/crd/bases/kagent.dev_agents.yaml b/go/api/config/crd/bases/kagent.dev_agents.yaml
index 8338f8e182..4d4a50483f 100644
--- a/go/api/config/crd/bases/kagent.dev_agents.yaml
+++ b/go/api/config/crd/bases/kagent.dev_agents.yaml
@@ -13110,6 +13110,35 @@ spec:
                         maxItems: 20
                         type: array
                     type: object
+                  reliability:
+                    description: |-
+                      Reliability configures self-healing and observability behaviors for the
+                      agent runtime, such as reflect-and-retry on failed tool calls, model call
+                      caps, and debug logging.
+                    properties:
+                      debugLogging:
+                        description: |-
+                          DebugLogging enables verbose runtime logging of every LLM
+                          request/response and tool call to the agent pod logs.
+                          Useful for debugging agent behavior; off by default.
+                        type: boolean
+                      maxLLMCalls:
+                        description: |-
+                          MaxLLMCalls caps the total number of model calls per request (cost safety
+                          rail). When the cap is exceeded the run stops with a clear error instead
+                          of looping. If unset, the runtime default applies (500).
+                        minimum: 1
+                        type: integer
+                      toolRetries:
+                        description: |-
+                          ToolRetries is the maximum number of consecutive failures for a tool call
+                          before the agent stops retrying it. When set, failed tool calls are followed
+                          by structured reflection guidance injected into the model context so the
+                          agent can self-correct instead of repeating the same failing call.
+                        maximum: 10
+                        minimum: 1
+                        type: integer
+                    type: object
                   runtime:
                     default: python
                     description: |-
diff --git a/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml b/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml
index ce185907d9..966ea72a9c 100644
--- a/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml
+++ b/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml
@@ -628,6 +628,26 @@ spec:
                 - Bedrock
                 - SAPAICore
                 type: string
+              retry:
+                description: |-
+                  Retry configures automatic retries of failed LLM HTTP requests
+                  (e.g. 429 rate limits, transient 5xx errors, timeouts) with
+                  exponential backoff, handled by the provider SDK.
+                properties:
+                  attempts:
+                    description: |-
+                      Attempts is the maximum number of retry attempts after the initial
+                      request fails. Retried errors include rate limits (429), request
+                      timeouts (408), and transient server errors (5xx); backoff between
+                      attempts is exponential. Set to 0 to disable retries entirely.
+                      Supported for the OpenAI, AzureOpenAI, Anthropic, and Gemini providers;
+                      ignored by other providers.
+                    maximum: 20
+                    minimum: 0
+                    type: integer
+                required:
+                - attempts
+                type: object
               sapAICore:
                 description: SAP AI Core-specific configuration
                 properties:
diff --git a/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml b/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml
index 3f8f594b50..516904f45e 100644
--- a/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml
+++ b/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml
@@ -10767,6 +10767,35 @@ spec:
                         maxItems: 20
                         type: array
                     type: object
+                  reliability:
+                    description: |-
+                      Reliability configures self-healing and observability behaviors for the
+                      agent runtime, such as reflect-and-retry on failed tool calls, model call
+                      caps, and debug logging.
+                    properties:
+                      debugLogging:
+                        description: |-
+                          DebugLogging enables verbose runtime logging of every LLM
+                          request/response and tool call to the agent pod logs.
+                          Useful for debugging agent behavior; off by default.
+                        type: boolean
+                      maxLLMCalls:
+                        description: |-
+                          MaxLLMCalls caps the total number of model calls per request (cost safety
+                          rail). When the cap is exceeded the run stops with a clear error instead
+                          of looping. If unset, the runtime default applies (500).
+                        minimum: 1
+                        type: integer
+                      toolRetries:
+                        description: |-
+                          ToolRetries is the maximum number of consecutive failures for a tool call
+                          before the agent stops retrying it. When set, failed tool calls are followed
+                          by structured reflection guidance injected into the model context so the
+                          agent can self-correct instead of repeating the same failing call.
+                        maximum: 10
+                        minimum: 1
+                        type: integer
+                    type: object
                   runtime:
                     default: python
                     description: |-
diff --git a/go/api/v1alpha2/agent_types.go b/go/api/v1alpha2/agent_types.go
index ebfdcd7325..10556fc63d 100644
--- a/go/api/v1alpha2/agent_types.go
+++ b/go/api/v1alpha2/agent_types.go
@@ -228,6 +228,35 @@ type DeclarativeAgentSpec struct {
 	// This includes event compaction (compression) and context caching.
 	// +optional
 	Context *ContextConfig `json:"context,omitempty"`
+
+	// Reliability configures self-healing and observability behaviors for the
+	// agent runtime, such as reflect-and-retry on failed tool calls, model call
+	// caps, and debug logging.
+	// +optional
+	Reliability *ReliabilityConfig `json:"reliability,omitempty"`
+}
+
+// ReliabilityConfig configures self-healing and observability behaviors for the agent runtime.
+type ReliabilityConfig struct {
+	// ToolRetries is the maximum number of consecutive failures for a tool call
+	// before the agent stops retrying it. When set, failed tool calls are followed
+	// by structured reflection guidance injected into the model context so the
+	// agent can self-correct instead of repeating the same failing call.
+	// +optional
+	// +kubebuilder:validation:Minimum=1
+	// +kubebuilder:validation:Maximum=10
+	ToolRetries *int `json:"toolRetries,omitempty"`
+	// MaxLLMCalls caps the total number of model calls per request (cost safety
+	// rail). When the cap is exceeded the run stops with a clear error instead
+	// of looping. If unset, the runtime default applies (500).
+	// +optional
+	// +kubebuilder:validation:Minimum=1
+	MaxLLMCalls *int `json:"maxLLMCalls,omitempty"`
+	// DebugLogging enables verbose runtime logging of every LLM
+	// request/response and tool call to the agent pod logs.
+	// Useful for debugging agent behavior; off by default.
+	// +optional
+	DebugLogging bool `json:"debugLogging,omitempty"`
 }
 
 // SandboxPlatform selects the control plane for sandboxed agents.
diff --git a/go/api/v1alpha2/modelconfig_types.go b/go/api/v1alpha2/modelconfig_types.go
index 3536611744..11e9e62bc8 100644
--- a/go/api/v1alpha2/modelconfig_types.go
+++ b/go/api/v1alpha2/modelconfig_types.go
@@ -419,6 +419,26 @@ type ModelConfigSpec struct {
 	// that use self-signed certificates or custom certificate authorities.
 	// +optional
 	TLS *TLSConfig `json:"tls,omitempty"`
+
+	// Retry configures automatic retries of failed LLM HTTP requests
+	// (e.g. 429 rate limits, transient 5xx errors, timeouts) with
+	// exponential backoff, handled by the provider SDK.
+	// +optional
+	Retry *ModelRetryConfig `json:"retry,omitempty"`
+}
+
+// ModelRetryConfig configures automatic retries of failed LLM HTTP requests.
+type ModelRetryConfig struct {
+	// Attempts is the maximum number of retry attempts after the initial
+	// request fails. Retried errors include rate limits (429), request
+	// timeouts (408), and transient server errors (5xx); backoff between
+	// attempts is exponential. Set to 0 to disable retries entirely.
+	// Supported for the OpenAI, AzureOpenAI, Anthropic, and Gemini providers;
+	// ignored by other providers.
+	// +required
+	// +kubebuilder:validation:Minimum=0
+	// +kubebuilder:validation:Maximum=20
+	Attempts int `json:"attempts"`
 }
 
 // ModelConfigStatus defines the observed state of ModelConfig.
diff --git a/go/api/v1alpha2/zz_generated.deepcopy.go b/go/api/v1alpha2/zz_generated.deepcopy.go
index 2c03020b70..3d5d35c354 100644
--- a/go/api/v1alpha2/zz_generated.deepcopy.go
+++ b/go/api/v1alpha2/zz_generated.deepcopy.go
@@ -874,6 +874,11 @@ func (in *DeclarativeAgentSpec) DeepCopyInto(out *DeclarativeAgentSpec) {
 		*out = new(ContextConfig)
 		(*in).DeepCopyInto(*out)
 	}
+	if in.Reliability != nil {
+		in, out := &in.Reliability, &out.Reliability
+		*out = new(ReliabilityConfig)
+		(*in).DeepCopyInto(*out)
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeclarativeAgentSpec.
@@ -1143,6 +1148,11 @@ func (in *ModelConfigSpec) DeepCopyInto(out *ModelConfigSpec) {
 		*out = new(TLSConfig)
 		**out = **in
 	}
+	if in.Retry != nil {
+		in, out := &in.Retry, &out.Retry
+		*out = new(ModelRetryConfig)
+		**out = **in
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelConfigSpec.
@@ -1287,6 +1297,21 @@ func (in *ModelProviderConfigStatus) DeepCopy() *ModelProviderConfigStatus {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *ModelRetryConfig) DeepCopyInto(out *ModelRetryConfig) {
+	*out = *in
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelRetryConfig.
+func (in *ModelRetryConfig) DeepCopy() *ModelRetryConfig {
+	if in == nil {
+		return nil
+	}
+	out := new(ModelRetryConfig)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *NetworkConfig) DeepCopyInto(out *NetworkConfig) {
 	*out = *in
@@ -1405,6 +1430,31 @@ func (in *PromptTemplateSpec) DeepCopy() *PromptTemplateSpec {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *ReliabilityConfig) DeepCopyInto(out *ReliabilityConfig) {
+	*out = *in
+	if in.ToolRetries != nil {
+		in, out := &in.ToolRetries, &out.ToolRetries
+		*out = new(int)
+		**out = **in
+	}
+	if in.MaxLLMCalls != nil {
+		in, out := &in.MaxLLMCalls, &out.MaxLLMCalls
+		*out = new(int)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReliabilityConfig.
+func (in *ReliabilityConfig) DeepCopy() *ReliabilityConfig {
+	if in == nil {
+		return nil
+	}
+	out := new(ReliabilityConfig)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *RemoteMCPServer) DeepCopyInto(out *RemoteMCPServer) {
 	*out = *in
diff --git a/go/core/internal/controller/translator/agent/adk_api_translator.go b/go/core/internal/controller/translator/agent/adk_api_translator.go
index 61e2892f7f..45f30bbfe0 100644
--- a/go/core/internal/controller/translator/agent/adk_api_translator.go
+++ b/go/core/internal/controller/translator/agent/adk_api_translator.go
@@ -310,14 +310,19 @@ func deriveTLSFields(tlsConfig *v1alpha2.TLSConfig) (*bool, *string, *bool) {
 	return insecureSkipVerify, caCertPath, disableSystemCAs
 }
 
-// populateTLSFields writes the derived TLS fields onto an adk.BaseModel.
-// Used by every model-provider branch in translateBaseModel — each provider
-// embeds BaseModel, so this single call replaces the explicit three-field
+// populateBaseModelFields writes the provider-agnostic ModelConfig fields
+// (TLS and retry) onto an adk.BaseModel.
+// Used by every model-provider branch in translateModel — each provider
+// embeds BaseModel, so this single call replaces the explicit per-field
 // assignment at each site. The MCP-connection params (StreamableHTTPConnectionParams,
-// SseConnectionParams) carry the same three fields but do not embed BaseModel;
+// SseConnectionParams) carry the same three TLS fields but do not embed BaseModel;
 // those callers assign through deriveTLSFields directly.
-func populateTLSFields(baseModel *adk.BaseModel, tlsConfig *v1alpha2.TLSConfig) {
-	baseModel.TLSInsecureSkipVerify, baseModel.TLSCACertPath, baseModel.TLSDisableSystemCAs = deriveTLSFields(tlsConfig)
+func populateBaseModelFields(baseModel *adk.BaseModel, spec *v1alpha2.ModelConfigSpec) {
+	baseModel.TLSInsecureSkipVerify, baseModel.TLSCACertPath, baseModel.TLSDisableSystemCAs = deriveTLSFields(spec.TLS)
+	if spec.Retry != nil {
+		attempts := spec.Retry.Attempts
+		baseModel.MaxRetries = &attempts
+	}
 }
 
 // addTLSConfiguration mounts a CA Secret as a per-Secret read-only volume on
@@ -471,8 +476,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC
 				Headers: model.Spec.DefaultHeaders,
 			},
 		}
-		// Populate TLS fields in BaseModel
-		populateTLSFields(&openai.BaseModel, model.Spec.TLS)
+		// Populate shared fields (TLS, retry) in BaseModel
+		populateBaseModelFields(&openai.BaseModel, &model.Spec)
 		// Populate TokenExchange fields (OpenAI-specific)
 		addTokenExchangeConfiguration(openai, modelDeploymentData, &model.Spec)
 		openai.APIKeyPassthrough = model.Spec.APIKeyPassthrough
@@ -529,8 +534,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC
 				Headers: model.Spec.DefaultHeaders,
 			},
 		}
-		// Populate TLS fields in BaseModel
-		populateTLSFields(&anthropic.BaseModel, model.Spec.TLS)
+		// Populate shared fields (TLS, retry) in BaseModel
+		populateBaseModelFields(&anthropic.BaseModel, &model.Spec)
 		anthropic.APIKeyPassthrough = model.Spec.APIKeyPassthrough
 
 		if model.Spec.Anthropic != nil {
@@ -587,8 +592,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC
 				Headers: model.Spec.DefaultHeaders,
 			},
 		}
-		// Populate TLS fields in BaseModel
-		populateTLSFields(&azureOpenAI.BaseModel, model.Spec.TLS)
+		// Populate shared fields (TLS, retry) in BaseModel
+		populateBaseModelFields(&azureOpenAI.BaseModel, &model.Spec)
 		azureOpenAI.APIKeyPassthrough = model.Spec.APIKeyPassthrough
 
 		return azureOpenAI, modelDeploymentData, secretHashBytes, nil
@@ -632,8 +637,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC
 				Headers: model.Spec.DefaultHeaders,
 			},
 		}
-		// Populate TLS fields in BaseModel
-		populateTLSFields(&gemini.BaseModel, model.Spec.TLS)
+		// Populate shared fields (TLS, retry) in BaseModel
+		populateBaseModelFields(&gemini.BaseModel, &model.Spec)
 		gemini.APIKeyPassthrough = model.Spec.APIKeyPassthrough
 
 		return gemini, modelDeploymentData, secretHashBytes, nil
@@ -673,8 +678,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC
 				Headers: model.Spec.DefaultHeaders,
 			},
 		}
-		// Populate TLS fields in BaseModel
-		populateTLSFields(&anthropic.BaseModel, model.Spec.TLS)
+		// Populate shared fields (TLS, retry) in BaseModel
+		populateBaseModelFields(&anthropic.BaseModel, &model.Spec)
 		anthropic.APIKeyPassthrough = model.Spec.APIKeyPassthrough
 
 		return anthropic, modelDeploymentData, secretHashBytes, nil
@@ -697,8 +702,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC
 			},
 			Options: model.Spec.Ollama.Options,
 		}
-		// Populate TLS fields in BaseModel
-		populateTLSFields(&ollama.BaseModel, model.Spec.TLS)
+		// Populate shared fields (TLS, retry) in BaseModel
+		populateBaseModelFields(&ollama.BaseModel, &model.Spec)
 		ollama.APIKeyPassthrough = model.Spec.APIKeyPassthrough
 
 		return ollama, modelDeploymentData, secretHashBytes, nil
@@ -720,8 +725,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC
 				Headers: model.Spec.DefaultHeaders,
 			},
 		}
-		// Populate TLS fields in BaseModel
-		populateTLSFields(&gemini.BaseModel, model.Spec.TLS)
+		// Populate shared fields (TLS, retry) in BaseModel
+		populateBaseModelFields(&gemini.BaseModel, &model.Spec)
 		return gemini, modelDeploymentData, secretHashBytes, nil
 	case v1alpha2.ModelProviderBedrock:
 		if model.Spec.Bedrock == nil {
@@ -808,8 +813,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC
 			AdditionalModelRequestFields: additionalFields,
 		}
 
-		// Populate TLS fields in BaseModel
-		populateTLSFields(&bedrock.BaseModel, model.Spec.TLS)
+		// Populate shared fields (TLS, retry) in BaseModel
+		populateBaseModelFields(&bedrock.BaseModel, &model.Spec)
 		bedrock.APIKeyPassthrough = model.Spec.APIKeyPassthrough
 
 		return bedrock, modelDeploymentData, secretHashBytes, nil
@@ -858,7 +863,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC
 			AuthUrl:       model.Spec.SAPAICore.AuthURL,
 		}
 
-		populateTLSFields(&sapAICore.BaseModel, model.Spec.TLS)
+		populateBaseModelFields(&sapAICore.BaseModel, &model.Spec)
 		sapAICore.APIKeyPassthrough = model.Spec.APIKeyPassthrough
 
 		return sapAICore, modelDeploymentData, secretHashBytes, nil
diff --git a/go/core/internal/controller/translator/agent/compiler.go b/go/core/internal/controller/translator/agent/compiler.go
index 9596e28db5..a6da613c65 100644
--- a/go/core/internal/controller/translator/agent/compiler.go
+++ b/go/core/internal/controller/translator/agent/compiler.go
@@ -187,6 +187,19 @@ func (a *adkApiTranslator) translateInlineAgent(ctx context.Context, agent v1alp
 		}
 	}
 
+	// Translate reliability configuration (reflect-and-retry on tool failures,
+	// LLM call cap, debug logging).
+	if r := spec.Declarative.Reliability; r != nil && (r.ToolRetries != nil || r.MaxLLMCalls != nil || r.DebugLogging) {
+		cfg.Reliability = &adk.ReliabilityConfig{
+			ToolRetries: r.ToolRetries,
+			MaxLLMCalls: r.MaxLLMCalls,
+		}
+		if r.DebugLogging {
+			debugLogging := true
+			cfg.Reliability.DebugLogging = &debugLogging
+		}
+	}
+
 	// Translate context management configuration
 	if spec.Declarative.Context != nil {
 		contextCfg := &adk.AgentContextConfig{}
diff --git a/go/core/internal/controller/translator/agent/testdata/inputs/agent_with_reliability.yaml b/go/core/internal/controller/translator/agent/testdata/inputs/agent_with_reliability.yaml
new file mode 100644
index 0000000000..9ca36a105d
--- /dev/null
+++ b/go/core/internal/controller/translator/agent/testdata/inputs/agent_with_reliability.yaml
@@ -0,0 +1,37 @@
+operation: translateAgent
+targetObject: basic-agent
+namespace: test
+objects:
+  - apiVersion: v1
+    kind: Secret
+    metadata:
+      name: openai-secret
+      namespace: test
+    data:
+      api-key: c2stdGVzdC1hcGkta2V5  # base64 encoded "sk-test-api-key"
+  - apiVersion: kagent.dev/v1alpha2
+    kind: ModelConfig
+    metadata:
+      name: basic-model
+      namespace: test
+    spec:
+      provider: OpenAI
+      model: gpt-4o
+      apiKeySecret: openai-secret
+      apiKeySecretKey: api-key
+  - apiVersion: kagent.dev/v1alpha2
+    kind: Agent
+    metadata:
+      name: basic-agent
+      namespace: test
+    spec:
+      type: Declarative
+      declarative:
+        description: A test agent with reliability and debug logging enabled
+        systemMessage: You are a helpful assistant.
+        modelConfig: basic-model
+        reliability:
+          toolRetries: 3
+          maxLLMCalls: 25
+          debugLogging: true
+        tools: []
diff --git a/go/core/internal/controller/translator/agent/testdata/inputs/modelconfig_with_retry.yaml b/go/core/internal/controller/translator/agent/testdata/inputs/modelconfig_with_retry.yaml
new file mode 100644
index 0000000000..baf8fd4341
--- /dev/null
+++ b/go/core/internal/controller/translator/agent/testdata/inputs/modelconfig_with_retry.yaml
@@ -0,0 +1,35 @@
+operation: translateAgent
+targetObject: basic-agent
+namespace: test
+objects:
+  - apiVersion: v1
+    kind: Secret
+    metadata:
+      name: openai-secret
+      namespace: test
+    data:
+      api-key: c2stdGVzdC1hcGkta2V5  # base64 encoded "sk-test-api-key"
+  - apiVersion: kagent.dev/v1alpha2
+    kind: ModelConfig
+    metadata:
+      name: retry-model
+      namespace: test
+    spec:
+      provider: OpenAI
+      model: gpt-4o
+      apiKeySecret: openai-secret
+      apiKeySecretKey: api-key
+      retry:
+        attempts: 5
+  - apiVersion: kagent.dev/v1alpha2
+    kind: Agent
+    metadata:
+      name: basic-agent
+      namespace: test
+    spec:
+      type: Declarative
+      declarative:
+        description: A test agent whose model retries failed LLM HTTP requests
+        systemMessage: You are a helpful assistant.
+        modelConfig: retry-model
+        tools: []
diff --git a/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_reliability.json b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_reliability.json
new file mode 100644
index 0000000000..91b984f2af
--- /dev/null
+++ b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_reliability.json
@@ -0,0 +1,299 @@
+{
+  "agentCard": {
+    "capabilities": {
+      "streaming": true
+    },
+    "defaultInputModes": [
+      "text"
+    ],
+    "defaultOutputModes": [
+      "text"
+    ],
+    "description": "",
+    "name": "basic_agent",
+    "skills": null,
+    "supportedInterfaces": [
+      {
+        "protocolBinding": "JSONRPC",
+        "protocolVersion": "0.3",
+        "url": "http://basic-agent.test:8080"
+      },
+      {
+        "protocolBinding": "JSONRPC",
+        "protocolVersion": "1.0",
+        "url": "http://basic-agent.test:8080"
+      }
+    ],
+    "version": ""
+  },
+  "config": {
+    "description": "",
+    "instruction": "You are a helpful assistant.",
+    "model": {
+      "base_url": "",
+      "model": "gpt-4o",
+      "type": "openai"
+    },
+    "reliability": {
+      "debug_logging": true,
+      "max_llm_calls": 25,
+      "tool_retries": 3
+    },
+    "stream": false
+  },
+  "manifest": [
+    {
+      "apiVersion": "v1",
+      "kind": "Secret",
+      "metadata": {
+        "labels": {
+          "app": "kagent",
+          "app.kubernetes.io/managed-by": "kagent",
+          "app.kubernetes.io/name": "basic-agent",
+          "app.kubernetes.io/part-of": "kagent",
+          "kagent": "basic-agent"
+        },
+        "name": "basic-agent",
+        "namespace": "test",
+        "ownerReferences": [
+          {
+            "apiVersion": "kagent.dev/v1alpha2",
+            "blockOwnerDeletion": true,
+            "controller": true,
+            "kind": "Agent",
+            "name": "basic-agent",
+            "uid": ""
+          }
+        ]
+      },
+      "stringData": {
+        "agent-card.json": "{\n  \"defaultInputModes\": [\n    \"text\"\n  ],\n  \"defaultOutputModes\": [\n    \"text\"\n  ],\n  \"description\": \"\",\n  \"name\": \"basic_agent\",\n  \"version\": \"\",\n  \"skills\": [],\n  \"capabilities\": {\n    \"streaming\": true\n  },\n  \"supportedInterfaces\": [\n    {\n      \"url\": \"http://basic-agent.test:8080\",\n      \"protocolBinding\": \"JSONRPC\",\n      \"protocolVersion\": \"0.3\"\n    },\n    {\n      \"url\": \"http://basic-agent.test:8080\",\n      \"protocolBinding\": \"JSONRPC\",\n      \"protocolVersion\": \"1.0\"\n    }\n  ],\n  \"url\": \"http://basic-agent.test:8080\",\n  \"protocolVersion\": \"0.3\",\n  \"preferredTransport\": \"JSONRPC\"\n}",
+        "config.json": "{\"model\":{\"type\":\"openai\",\"model\":\"gpt-4o\",\"base_url\":\"\"},\"description\":\"\",\"instruction\":\"You are a helpful assistant.\",\"stream\":false,\"reliability\":{\"tool_retries\":3,\"max_llm_calls\":25,\"debug_logging\":true}}"
+      }
+    },
+    {
+      "apiVersion": "v1",
+      "kind": "ServiceAccount",
+      "metadata": {
+        "labels": {
+          "app": "kagent",
+          "app.kubernetes.io/managed-by": "kagent",
+          "app.kubernetes.io/name": "basic-agent",
+          "app.kubernetes.io/part-of": "kagent",
+          "kagent": "basic-agent"
+        },
+        "name": "basic-agent",
+        "namespace": "test",
+        "ownerReferences": [
+          {
+            "apiVersion": "kagent.dev/v1alpha2",
+            "blockOwnerDeletion": true,
+            "controller": true,
+            "kind": "Agent",
+            "name": "basic-agent",
+            "uid": ""
+          }
+        ]
+      }
+    },
+    {
+      "apiVersion": "apps/v1",
+      "kind": "Deployment",
+      "metadata": {
+        "labels": {
+          "app": "kagent",
+          "app.kubernetes.io/managed-by": "kagent",
+          "app.kubernetes.io/name": "basic-agent",
+          "app.kubernetes.io/part-of": "kagent",
+          "kagent": "basic-agent"
+        },
+        "name": "basic-agent",
+        "namespace": "test",
+        "ownerReferences": [
+          {
+            "apiVersion": "kagent.dev/v1alpha2",
+            "blockOwnerDeletion": true,
+            "controller": true,
+            "kind": "Agent",
+            "name": "basic-agent",
+            "uid": ""
+          }
+        ]
+      },
+      "spec": {
+        "selector": {
+          "matchLabels": {
+            "app": "kagent",
+            "kagent": "basic-agent"
+          }
+        },
+        "strategy": {
+          "rollingUpdate": {
+            "maxSurge": 1,
+            "maxUnavailable": 0
+          },
+          "type": "RollingUpdate"
+        },
+        "template": {
+          "metadata": {
+            "annotations": {
+              "kagent.dev/config-hash": "17561973486592316877"
+            },
+            "labels": {
+              "app": "kagent",
+              "app.kubernetes.io/managed-by": "kagent",
+              "app.kubernetes.io/name": "basic-agent",
+              "app.kubernetes.io/part-of": "kagent",
+              "kagent": "basic-agent"
+            }
+          },
+          "spec": {
+            "containers": [
+              {
+                "args": [
+                  "--host",
+                  "0.0.0.0",
+                  "--port",
+                  "8080",
+                  "--filepath",
+                  "/config"
+                ],
+                "env": [
+                  {
+                    "name": "OPENAI_API_KEY",
+                    "valueFrom": {
+                      "secretKeyRef": {
+                        "key": "api-key",
+                        "name": "openai-secret"
+                      }
+                    }
+                  },
+                  {
+                    "name": "KAGENT_NAMESPACE",
+                    "valueFrom": {
+                      "fieldRef": {
+                        "fieldPath": "metadata.namespace"
+                      }
+                    }
+                  },
+                  {
+                    "name": "KAGENT_NAME",
+                    "value": "basic-agent"
+                  },
+                  {
+                    "name": "KAGENT_URL",
+                    "value": "http://kagent-controller.kagent:8083"
+                  }
+                ],
+                "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app",
+                "imagePullPolicy": "IfNotPresent",
+                "name": "kagent",
+                "ports": [
+                  {
+                    "containerPort": 8080,
+                    "name": "http"
+                  }
+                ],
+                "readinessProbe": {
+                  "httpGet": {
+                    "path": "/.well-known/agent-card.json",
+                    "port": "http"
+                  },
+                  "initialDelaySeconds": 15,
+                  "periodSeconds": 15,
+                  "timeoutSeconds": 15
+                },
+                "resources": {
+                  "limits": {
+                    "cpu": "2",
+                    "memory": "1Gi"
+                  },
+                  "requests": {
+                    "cpu": "100m",
+                    "memory": "384Mi"
+                  }
+                },
+                "volumeMounts": [
+                  {
+                    "mountPath": "/config",
+                    "name": "config"
+                  },
+                  {
+                    "mountPath": "/var/run/secrets/tokens",
+                    "name": "kagent-token"
+                  }
+                ]
+              }
+            ],
+            "serviceAccountName": "basic-agent",
+            "volumes": [
+              {
+                "name": "config",
+                "secret": {
+                  "secretName": "basic-agent"
+                }
+              },
+              {
+                "name": "kagent-token",
+                "projected": {
+                  "sources": [
+                    {
+                      "serviceAccountToken": {
+                        "audience": "kagent",
+                        "expirationSeconds": 3600,
+                        "path": "kagent-token"
+                      }
+                    }
+                  ]
+                }
+              }
+            ]
+          }
+        }
+      },
+      "status": {}
+    },
+    {
+      "apiVersion": "v1",
+      "kind": "Service",
+      "metadata": {
+        "labels": {
+          "app": "kagent",
+          "app.kubernetes.io/managed-by": "kagent",
+          "app.kubernetes.io/name": "basic-agent",
+          "app.kubernetes.io/part-of": "kagent",
+          "kagent": "basic-agent"
+        },
+        "name": "basic-agent",
+        "namespace": "test",
+        "ownerReferences": [
+          {
+            "apiVersion": "kagent.dev/v1alpha2",
+            "blockOwnerDeletion": true,
+            "controller": true,
+            "kind": "Agent",
+            "name": "basic-agent",
+            "uid": ""
+          }
+        ]
+      },
+      "spec": {
+        "ports": [
+          {
+            "name": "http",
+            "port": 8080,
+            "targetPort": 8080
+          }
+        ],
+        "selector": {
+          "app": "kagent",
+          "kagent": "basic-agent"
+        },
+        "type": "ClusterIP"
+      },
+      "status": {
+        "loadBalancer": {}
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/go/core/internal/controller/translator/agent/testdata/outputs/modelconfig_with_retry.json b/go/core/internal/controller/translator/agent/testdata/outputs/modelconfig_with_retry.json
new file mode 100644
index 0000000000..6acedd5e26
--- /dev/null
+++ b/go/core/internal/controller/translator/agent/testdata/outputs/modelconfig_with_retry.json
@@ -0,0 +1,295 @@
+{
+  "agentCard": {
+    "capabilities": {
+      "streaming": true
+    },
+    "defaultInputModes": [
+      "text"
+    ],
+    "defaultOutputModes": [
+      "text"
+    ],
+    "description": "",
+    "name": "basic_agent",
+    "skills": null,
+    "supportedInterfaces": [
+      {
+        "protocolBinding": "JSONRPC",
+        "protocolVersion": "0.3",
+        "url": "http://basic-agent.test:8080"
+      },
+      {
+        "protocolBinding": "JSONRPC",
+        "protocolVersion": "1.0",
+        "url": "http://basic-agent.test:8080"
+      }
+    ],
+    "version": ""
+  },
+  "config": {
+    "description": "",
+    "instruction": "You are a helpful assistant.",
+    "model": {
+      "base_url": "",
+      "max_retries": 5,
+      "model": "gpt-4o",
+      "type": "openai"
+    },
+    "stream": false
+  },
+  "manifest": [
+    {
+      "apiVersion": "v1",
+      "kind": "Secret",
+      "metadata": {
+        "labels": {
+          "app": "kagent",
+          "app.kubernetes.io/managed-by": "kagent",
+          "app.kubernetes.io/name": "basic-agent",
+          "app.kubernetes.io/part-of": "kagent",
+          "kagent": "basic-agent"
+        },
+        "name": "basic-agent",
+        "namespace": "test",
+        "ownerReferences": [
+          {
+            "apiVersion": "kagent.dev/v1alpha2",
+            "blockOwnerDeletion": true,
+            "controller": true,
+            "kind": "Agent",
+            "name": "basic-agent",
+            "uid": ""
+          }
+        ]
+      },
+      "stringData": {
+        "agent-card.json": "{\n  \"defaultInputModes\": [\n    \"text\"\n  ],\n  \"defaultOutputModes\": [\n    \"text\"\n  ],\n  \"description\": \"\",\n  \"name\": \"basic_agent\",\n  \"version\": \"\",\n  \"skills\": [],\n  \"capabilities\": {\n    \"streaming\": true\n  },\n  \"supportedInterfaces\": [\n    {\n      \"url\": \"http://basic-agent.test:8080\",\n      \"protocolBinding\": \"JSONRPC\",\n      \"protocolVersion\": \"0.3\"\n    },\n    {\n      \"url\": \"http://basic-agent.test:8080\",\n      \"protocolBinding\": \"JSONRPC\",\n      \"protocolVersion\": \"1.0\"\n    }\n  ],\n  \"url\": \"http://basic-agent.test:8080\",\n  \"protocolVersion\": \"0.3\",\n  \"preferredTransport\": \"JSONRPC\"\n}",
+        "config.json": "{\"model\":{\"type\":\"openai\",\"model\":\"gpt-4o\",\"max_retries\":5,\"base_url\":\"\"},\"description\":\"\",\"instruction\":\"You are a helpful assistant.\",\"stream\":false}"
+      }
+    },
+    {
+      "apiVersion": "v1",
+      "kind": "ServiceAccount",
+      "metadata": {
+        "labels": {
+          "app": "kagent",
+          "app.kubernetes.io/managed-by": "kagent",
+          "app.kubernetes.io/name": "basic-agent",
+          "app.kubernetes.io/part-of": "kagent",
+          "kagent": "basic-agent"
+        },
+        "name": "basic-agent",
+        "namespace": "test",
+        "ownerReferences": [
+          {
+            "apiVersion": "kagent.dev/v1alpha2",
+            "blockOwnerDeletion": true,
+            "controller": true,
+            "kind": "Agent",
+            "name": "basic-agent",
+            "uid": ""
+          }
+        ]
+      }
+    },
+    {
+      "apiVersion": "apps/v1",
+      "kind": "Deployment",
+      "metadata": {
+        "labels": {
+          "app": "kagent",
+          "app.kubernetes.io/managed-by": "kagent",
+          "app.kubernetes.io/name": "basic-agent",
+          "app.kubernetes.io/part-of": "kagent",
+          "kagent": "basic-agent"
+        },
+        "name": "basic-agent",
+        "namespace": "test",
+        "ownerReferences": [
+          {
+            "apiVersion": "kagent.dev/v1alpha2",
+            "blockOwnerDeletion": true,
+            "controller": true,
+            "kind": "Agent",
+            "name": "basic-agent",
+            "uid": ""
+          }
+        ]
+      },
+      "spec": {
+        "selector": {
+          "matchLabels": {
+            "app": "kagent",
+            "kagent": "basic-agent"
+          }
+        },
+        "strategy": {
+          "rollingUpdate": {
+            "maxSurge": 1,
+            "maxUnavailable": 0
+          },
+          "type": "RollingUpdate"
+        },
+        "template": {
+          "metadata": {
+            "annotations": {
+              "kagent.dev/config-hash": "17753135717083412249"
+            },
+            "labels": {
+              "app": "kagent",
+              "app.kubernetes.io/managed-by": "kagent",
+              "app.kubernetes.io/name": "basic-agent",
+              "app.kubernetes.io/part-of": "kagent",
+              "kagent": "basic-agent"
+            }
+          },
+          "spec": {
+            "containers": [
+              {
+                "args": [
+                  "--host",
+                  "0.0.0.0",
+                  "--port",
+                  "8080",
+                  "--filepath",
+                  "/config"
+                ],
+                "env": [
+                  {
+                    "name": "OPENAI_API_KEY",
+                    "valueFrom": {
+                      "secretKeyRef": {
+                        "key": "api-key",
+                        "name": "openai-secret"
+                      }
+                    }
+                  },
+                  {
+                    "name": "KAGENT_NAMESPACE",
+                    "valueFrom": {
+                      "fieldRef": {
+                        "fieldPath": "metadata.namespace"
+                      }
+                    }
+                  },
+                  {
+                    "name": "KAGENT_NAME",
+                    "value": "basic-agent"
+                  },
+                  {
+                    "name": "KAGENT_URL",
+                    "value": "http://kagent-controller.kagent:8083"
+                  }
+                ],
+                "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app",
+                "imagePullPolicy": "IfNotPresent",
+                "name": "kagent",
+                "ports": [
+                  {
+                    "containerPort": 8080,
+                    "name": "http"
+                  }
+                ],
+                "readinessProbe": {
+                  "httpGet": {
+                    "path": "/.well-known/agent-card.json",
+                    "port": "http"
+                  },
+                  "initialDelaySeconds": 15,
+                  "periodSeconds": 15,
+                  "timeoutSeconds": 15
+                },
+                "resources": {
+                  "limits": {
+                    "cpu": "2",
+                    "memory": "1Gi"
+                  },
+                  "requests": {
+                    "cpu": "100m",
+                    "memory": "384Mi"
+                  }
+                },
+                "volumeMounts": [
+                  {
+                    "mountPath": "/config",
+                    "name": "config"
+                  },
+                  {
+                    "mountPath": "/var/run/secrets/tokens",
+                    "name": "kagent-token"
+                  }
+                ]
+              }
+            ],
+            "serviceAccountName": "basic-agent",
+            "volumes": [
+              {
+                "name": "config",
+                "secret": {
+                  "secretName": "basic-agent"
+                }
+              },
+              {
+                "name": "kagent-token",
+                "projected": {
+                  "sources": [
+                    {
+                      "serviceAccountToken": {
+                        "audience": "kagent",
+                        "expirationSeconds": 3600,
+                        "path": "kagent-token"
+                      }
+                    }
+                  ]
+                }
+              }
+            ]
+          }
+        }
+      },
+      "status": {}
+    },
+    {
+      "apiVersion": "v1",
+      "kind": "Service",
+      "metadata": {
+        "labels": {
+          "app": "kagent",
+          "app.kubernetes.io/managed-by": "kagent",
+          "app.kubernetes.io/name": "basic-agent",
+          "app.kubernetes.io/part-of": "kagent",
+          "kagent": "basic-agent"
+        },
+        "name": "basic-agent",
+        "namespace": "test",
+        "ownerReferences": [
+          {
+            "apiVersion": "kagent.dev/v1alpha2",
+            "blockOwnerDeletion": true,
+            "controller": true,
+            "kind": "Agent",
+            "name": "basic-agent",
+            "uid": ""
+          }
+        ]
+      },
+      "spec": {
+        "ports": [
+          {
+            "name": "http",
+            "port": 8080,
+            "targetPort": 8080
+          }
+        ],
+        "selector": {
+          "app": "kagent",
+          "kagent": "basic-agent"
+        },
+        "type": "ClusterIP"
+      },
+      "status": {
+        "loadBalancer": {}
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/helm/kagent-crds/templates/kagent.dev_agents.yaml b/helm/kagent-crds/templates/kagent.dev_agents.yaml
index 8338f8e182..4d4a50483f 100644
--- a/helm/kagent-crds/templates/kagent.dev_agents.yaml
+++ b/helm/kagent-crds/templates/kagent.dev_agents.yaml
@@ -13110,6 +13110,35 @@ spec:
                         maxItems: 20
                         type: array
                     type: object
+                  reliability:
+                    description: |-
+                      Reliability configures self-healing and observability behaviors for the
+                      agent runtime, such as reflect-and-retry on failed tool calls, model call
+                      caps, and debug logging.
+                    properties:
+                      debugLogging:
+                        description: |-
+                          DebugLogging enables verbose runtime logging of every LLM
+                          request/response and tool call to the agent pod logs.
+                          Useful for debugging agent behavior; off by default.
+                        type: boolean
+                      maxLLMCalls:
+                        description: |-
+                          MaxLLMCalls caps the total number of model calls per request (cost safety
+                          rail). When the cap is exceeded the run stops with a clear error instead
+                          of looping. If unset, the runtime default applies (500).
+                        minimum: 1
+                        type: integer
+                      toolRetries:
+                        description: |-
+                          ToolRetries is the maximum number of consecutive failures for a tool call
+                          before the agent stops retrying it. When set, failed tool calls are followed
+                          by structured reflection guidance injected into the model context so the
+                          agent can self-correct instead of repeating the same failing call.
+                        maximum: 10
+                        minimum: 1
+                        type: integer
+                    type: object
                   runtime:
                     default: python
                     description: |-
diff --git a/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml b/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml
index ce185907d9..966ea72a9c 100644
--- a/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml
+++ b/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml
@@ -628,6 +628,26 @@ spec:
                 - Bedrock
                 - SAPAICore
                 type: string
+              retry:
+                description: |-
+                  Retry configures automatic retries of failed LLM HTTP requests
+                  (e.g. 429 rate limits, transient 5xx errors, timeouts) with
+                  exponential backoff, handled by the provider SDK.
+                properties:
+                  attempts:
+                    description: |-
+                      Attempts is the maximum number of retry attempts after the initial
+                      request fails. Retried errors include rate limits (429), request
+                      timeouts (408), and transient server errors (5xx); backoff between
+                      attempts is exponential. Set to 0 to disable retries entirely.
+                      Supported for the OpenAI, AzureOpenAI, Anthropic, and Gemini providers;
+                      ignored by other providers.
+                    maximum: 20
+                    minimum: 0
+                    type: integer
+                required:
+                - attempts
+                type: object
               sapAICore:
                 description: SAP AI Core-specific configuration
                 properties:
diff --git a/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml b/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml
index 3f8f594b50..516904f45e 100644
--- a/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml
+++ b/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml
@@ -10767,6 +10767,35 @@ spec:
                         maxItems: 20
                         type: array
                     type: object
+                  reliability:
+                    description: |-
+                      Reliability configures self-healing and observability behaviors for the
+                      agent runtime, such as reflect-and-retry on failed tool calls, model call
+                      caps, and debug logging.
+                    properties:
+                      debugLogging:
+                        description: |-
+                          DebugLogging enables verbose runtime logging of every LLM
+                          request/response and tool call to the agent pod logs.
+                          Useful for debugging agent behavior; off by default.
+                        type: boolean
+                      maxLLMCalls:
+                        description: |-
+                          MaxLLMCalls caps the total number of model calls per request (cost safety
+                          rail). When the cap is exceeded the run stops with a clear error instead
+                          of looping. If unset, the runtime default applies (500).
+                        minimum: 1
+                        type: integer
+                      toolRetries:
+                        description: |-
+                          ToolRetries is the maximum number of consecutive failures for a tool call
+                          before the agent stops retrying it. When set, failed tool calls are followed
+                          by structured reflection guidance injected into the model context so the
+                          agent can self-correct instead of repeating the same failing call.
+                        maximum: 10
+                        minimum: 1
+                        type: integer
+                    type: object
                   runtime:
                     default: python
                     description: |-
diff --git a/python/packages/kagent-adk/src/kagent/adk/_a2a.py b/python/packages/kagent-adk/src/kagent/adk/_a2a.py
index 7b08e6f179..7db6080b63 100644
--- a/python/packages/kagent-adk/src/kagent/adk/_a2a.py
+++ b/python/packages/kagent-adk/src/kagent/adk/_a2a.py
@@ -139,7 +139,14 @@ def create_runner() -> Runner:
 
         agent_executor = A2aAgentExecutor(
             runner=create_runner,
-            config=A2aAgentExecutorConfig(stream=self.stream),
+            config=A2aAgentExecutorConfig(
+                stream=self.stream,
+                max_llm_calls=(
+                    self.agent_config.reliability.max_llm_calls
+                    if self.agent_config and self.agent_config.reliability
+                    else None
+                ),
+            ),
             task_store=task_store,
         )
 
diff --git a/python/packages/kagent-adk/src/kagent/adk/_agent_executor.py b/python/packages/kagent-adk/src/kagent/adk/_agent_executor.py
index 0ed10a3177..04a8cde307 100644
--- a/python/packages/kagent-adk/src/kagent/adk/_agent_executor.py
+++ b/python/packages/kagent-adk/src/kagent/adk/_agent_executor.py
@@ -64,6 +64,8 @@ class A2aAgentExecutorConfig(BaseModel):
     """Configuration for the KAgent A2aAgentExecutor."""
 
     stream: bool = False
+    # Cap on the total number of model calls per request (None = ADK default).
+    max_llm_calls: int | None = None
 
 
 def _kagent_request_converter(request, _part_converter=None):
@@ -209,7 +211,8 @@ async def _execute_impl(
 
         # Convert the a2a request to ADK run args
         stream = self._kagent_config.stream if self._kagent_config is not None else False
-        run_args = convert_a2a_request_to_adk_run_args(context, stream=stream)
+        max_llm_calls = self._kagent_config.max_llm_calls if self._kagent_config is not None else None
+        run_args = convert_a2a_request_to_adk_run_args(context, stream=stream, max_llm_calls=max_llm_calls)
 
         # Prepare span attributes.
         span_attributes = {}
@@ -250,9 +253,22 @@ async def _execute_impl(
             except Exception as e:
                 logger.error("Error handling A2A request: %s", e, exc_info=True)
 
-                # Check if this is a LiteLLM JSON parsing error (common with Ollama models that don't support function calling)
                 error_message = str(e)
-                if (
+
+                # LLM call cap (reliability.maxLLMCalls) exceeded — surface a clear,
+                # user-facing message instead of the raw exception text.
+                from google.adk.agents.invocation_context import LlmCallsLimitExceededError
+
+                if isinstance(e, LlmCallsLimitExceededError):
+                    max_calls = self._kagent_config.max_llm_calls if self._kagent_config is not None else None
+                    limit = f" {max_calls}" if max_calls is not None else ""
+                    error_message = (
+                        f"Agent stopped: exceeded the configured limit of{limit} model calls for a single request. "
+                        "This safety rail prevents runaway loops. If the task legitimately needs more model calls, "
+                        "increase reliability.maxLLMCalls on the agent."
+                    )
+                # Check if this is a LiteLLM JSON parsing error (common with Ollama models that don't support function calling)
+                elif (
                     "JSONDecodeError" in error_message
                     or "Unterminated string" in error_message
                     or "APIConnectionError" in error_message
diff --git a/python/packages/kagent-adk/src/kagent/adk/_reflect_retry_plugin.py b/python/packages/kagent-adk/src/kagent/adk/_reflect_retry_plugin.py
new file mode 100644
index 0000000000..79c681ec7c
--- /dev/null
+++ b/python/packages/kagent-adk/src/kagent/adk/_reflect_retry_plugin.py
@@ -0,0 +1,33 @@
+from typing import Any, Optional
+
+from google.adk.plugins import ReflectAndRetryToolPlugin
+from google.adk.tools import BaseTool, ToolContext
+
+
+class KAgentReflectAndRetryToolPlugin(ReflectAndRetryToolPlugin):
+    """ReflectAndRetryToolPlugin that handles both tool failure modes.
+
+    Two ways a tool call can fail:
+
+    1. The tool raises an exception — handled by the base class via
+       ``on_tool_error_callback`` (inherited unchanged).
+    2. The tool returns normally but reports an error in-band — MCP tools
+       never raise; they return a CallToolResult dict with ``isError: True``.
+       The base plugin ignores these, so we override
+       ``extract_error_from_result`` to treat them as failures.
+
+    Both paths feed the same per-tool failure counter and reflect-and-retry
+    flow, and a successful call resets the counter.
+    """
+
+    async def extract_error_from_result(
+        self,
+        *,
+        tool: BaseTool,
+        tool_args: dict[str, Any],
+        tool_context: ToolContext,
+        result: Any,
+    ) -> Optional[Any]:
+        if isinstance(result, dict) and result.get("isError"):
+            return result
+        return None
diff --git a/python/packages/kagent-adk/src/kagent/adk/cli.py b/python/packages/kagent-adk/src/kagent/adk/cli.py
index e32d0aacbf..c7e37a77dc 100644
--- a/python/packages/kagent-adk/src/kagent/adk/cli.py
+++ b/python/packages/kagent-adk/src/kagent/adk/cli.py
@@ -78,6 +78,29 @@ def static(
             plugins = []
         plugins.append(LLMPassthroughPlugin())
 
+    if agent_config.reliability and agent_config.reliability.debug_logging:
+        from google.adk.plugins import LoggingPlugin
+
+        if plugins is None:
+            plugins = []
+        plugins.append(LoggingPlugin())
+        logger.info("Debug logging plugin enabled")
+
+    if agent_config.reliability and agent_config.reliability.tool_retries:
+        from ._reflect_retry_plugin import KAgentReflectAndRetryToolPlugin
+
+        if plugins is None:
+            plugins = []
+        plugins.append(
+            KAgentReflectAndRetryToolPlugin(
+                max_retries=agent_config.reliability.tool_retries,
+                # Return reflection guidance to the model instead of raising,
+                # so the conversation degrades gracefully after max retries.
+                throw_exception_if_retry_exceeded=False,
+            )
+        )
+        logger.info(f"Reflect-and-retry plugin enabled (tool_retries={agent_config.reliability.tool_retries})")
+
     def root_agent_factory() -> BaseAgent:
         root_agent = agent_config.to_agent(app_cfg.name, sts_integration, propagate_token)
 
diff --git a/python/packages/kagent-adk/src/kagent/adk/converters/request_converter.py b/python/packages/kagent-adk/src/kagent/adk/converters/request_converter.py
index 88844daf68..587cfd67c6 100644
--- a/python/packages/kagent-adk/src/kagent/adk/converters/request_converter.py
+++ b/python/packages/kagent-adk/src/kagent/adk/converters/request_converter.py
@@ -20,10 +20,15 @@ def _get_user_id(request: RequestContext) -> str:
 def convert_a2a_request_to_adk_run_args(
     request: RequestContext,
     stream: bool = False,
+    max_llm_calls: int | None = None,
 ) -> dict[str, Any]:
     if not request.message:
         raise ValueError("Request message cannot be None")
 
+    run_config_kwargs: dict[str, Any] = {"streaming_mode": StreamingMode.SSE if stream else StreamingMode.NONE}
+    if max_llm_calls is not None:
+        run_config_kwargs["max_llm_calls"] = max_llm_calls
+
     return {
         "user_id": _get_user_id(request),
         "session_id": request.context_id,
@@ -31,5 +36,5 @@ def convert_a2a_request_to_adk_run_args(
             role="user",
             parts=[convert_a2a_part_to_genai_part(part) for part in request.message.parts],
         ),
-        "run_config": RunConfig(streaming_mode=StreamingMode.SSE if stream else StreamingMode.NONE),
+        "run_config": RunConfig(**run_config_kwargs),
     }
diff --git a/python/packages/kagent-adk/src/kagent/adk/models/_anthropic.py b/python/packages/kagent-adk/src/kagent/adk/models/_anthropic.py
index b8e9e68cd0..6848e2ce23 100644
--- a/python/packages/kagent-adk/src/kagent/adk/models/_anthropic.py
+++ b/python/packages/kagent-adk/src/kagent/adk/models/_anthropic.py
@@ -20,6 +20,10 @@ class KAgentAnthropicLlm(KAgentTLSMixin, AnthropicLlm):
 
     api_key_passthrough: Optional[bool] = None
 
+    # Max retry attempts for failed HTTP requests (429, 408, transient 5xx).
+    # Mapped to the Anthropic SDK's max_retries (exponential backoff). None = SDK default (2).
+    max_retries: Optional[int] = None
+
     _api_key: Optional[str] = None
     base_url: Optional[str] = None
     extra_headers: Optional[dict[str, str]] = None
@@ -51,6 +55,8 @@ def _anthropic_client(self) -> AsyncAnthropic:
             kwargs["base_url"] = self.base_url
         if self.extra_headers:
             kwargs["default_headers"] = self.extra_headers
+        if self.max_retries is not None:
+            kwargs["max_retries"] = self.max_retries
 
         # Use the httpx.AsyncClient with SSL configuration if present
         http_client = self._create_http_client()
diff --git a/python/packages/kagent-adk/src/kagent/adk/models/_openai.py b/python/packages/kagent-adk/src/kagent/adk/models/_openai.py
index ae6fd6df46..045f801313 100644
--- a/python/packages/kagent-adk/src/kagent/adk/models/_openai.py
+++ b/python/packages/kagent-adk/src/kagent/adk/models/_openai.py
@@ -385,6 +385,10 @@ class BaseOpenAI(KAgentTLSMixin, BaseLlm):
     # API key passthrough: forward the Bearer token from incoming requests as the LLM API key
     api_key_passthrough: Optional[bool] = None
 
+    # Max retry attempts for failed HTTP requests (429, 408, transient 5xx).
+    # Mapped to the OpenAI SDK's max_retries (exponential backoff). None = SDK default (2).
+    max_retries: Optional[int] = None
+
     # GDCH token exchange: refreshes a short-lived bearer token before each model call.
     token_exchange: Optional[GDCHTokenSource] = Field(default=None, exclude=True)
 
@@ -415,12 +419,17 @@ def _client(self) -> AsyncOpenAI:
         """Get the OpenAI client with optional custom SSL configuration."""
         http_client = self._create_http_client()
 
+        kwargs = {}
+        if self.max_retries is not None:
+            kwargs["max_retries"] = self.max_retries
+
         return AsyncOpenAI(
             api_key=self.api_key,
             base_url=self.base_url or None,
             default_headers=self.default_headers,
             timeout=self.timeout,
             http_client=http_client,
+            **kwargs,
         )
 
     async def generate_content_async(
@@ -633,10 +642,15 @@ def _client(self) -> AsyncAzureOpenAI:
 
         http_client = self._create_http_client()
 
+        kwargs = {}
+        if self.max_retries is not None:
+            kwargs["max_retries"] = self.max_retries
+
         return AsyncAzureOpenAI(
             api_key=api_key,
             api_version=api_version,
             azure_endpoint=azure_endpoint,
             default_headers=self.default_headers,
             http_client=http_client,
+            **kwargs,
         )
diff --git a/python/packages/kagent-adk/src/kagent/adk/types.py b/python/packages/kagent-adk/src/kagent/adk/types.py
index 889ed73e0b..4db279c55a 100644
--- a/python/packages/kagent-adk/src/kagent/adk/types.py
+++ b/python/packages/kagent-adk/src/kagent/adk/types.py
@@ -15,7 +15,6 @@
 
 from kagent.adk._approval import make_approval_callback, strip_confirmation_parts_callback
 from kagent.adk._mcp_toolset import KAgentMcpToolset
-from kagent.adk.models._ssl import create_ssl_context
 from kagent.adk._remote_a2a_tool import KAgentRemoteA2AToolset
 from kagent.adk.models._anthropic import KAgentAnthropicLlm
 from kagent.adk.models._bedrock import KAgentBedrockLlm
@@ -23,6 +22,7 @@
 from kagent.adk.models._ollama import create_ollama_llm
 from kagent.adk.models._openai import AzureOpenAI as OpenAIAzure
 from kagent.adk.models._openai import OpenAI as OpenAINative
+from kagent.adk.models._ssl import create_ssl_context
 from kagent.adk.sandbox_code_executer import SandboxedLocalCodeExecutor
 from kagent.adk.tools.ask_user_tool import AskUserTool
 
@@ -254,6 +254,10 @@ class BaseLLM(BaseModel):
     # API key passthrough: forward the Bearer token from incoming requests as the LLM API key
     api_key_passthrough: bool | None = None
 
+    # Max retry attempts for failed LLM HTTP requests (429, 408, transient 5xx),
+    # retried with exponential backoff by the provider SDK. None = SDK default.
+    max_retries: int | None = None
+
 
 class GDCHTokenExchangeConfig(BaseModel):
     service_account_path: str
@@ -361,6 +365,17 @@ class NetworkConfig(BaseModel):
     allowed_domains: list[str] = Field(default_factory=list)
 
 
+class ReliabilityConfig(BaseModel):
+    """Reliability configuration for self-healing and observability behaviors."""
+
+    # Max consecutive failures for a tool call before the agent stops retrying it.
+    tool_retries: int | None = None
+    # Cap on the total number of model calls per request (cost safety rail).
+    max_llm_calls: int | None = None
+    # Log every LLM request/response and tool call to the agent pod logs.
+    debug_logging: bool | None = None
+
+
 class AgentConfig(BaseModel):
     model: ModelUnion = Field(discriminator="type")
     description: str
@@ -373,6 +388,7 @@ class AgentConfig(BaseModel):
     memory: MemoryConfig | None = None  # Memory configuration
     network: NetworkConfig | None = None
     context_config: ContextConfig | None = None
+    reliability: ReliabilityConfig | None = None  # Self-healing: retries, call caps, debug logging
 
     def to_agent(
         self, name: str, sts_integration: Optional[ADKTokenPropagationPlugin] = None, propagate_token: bool = False
@@ -604,6 +620,18 @@ def _create_llm_from_model_config(model_config: ModelUnion):
     extra_headers = model_config.headers or {}
     base_url = getattr(model_config, "base_url", None)
 
+    if model_config.max_retries is not None and model_config.type not in (
+        "openai",
+        "azure_openai",
+        "anthropic",
+        "gemini",
+    ):
+        logger.warning(
+            "retry.attempts is not supported for model type %s; ignoring (max_retries=%d)",
+            model_config.type,
+            model_config.max_retries,
+        )
+
     if model_config.type == "openai":
         from .models._token_source import GDCHTokenSource
 
@@ -640,6 +668,7 @@ def _create_llm_from_model_config(model_config: ModelUnion):
             temperature=model_config.temperature,
             timeout=model_config.timeout,
             top_p=model_config.top_p,
+            max_retries=model_config.max_retries,
             token_exchange=token_exchange,
             **_transport_kwargs(model_config),
         )
@@ -648,6 +677,7 @@ def _create_llm_from_model_config(model_config: ModelUnion):
             model=model_config.model,
             base_url=base_url,
             extra_headers=extra_headers,
+            max_retries=model_config.max_retries,
             **_transport_kwargs(model_config),
         )
     if model_config.type == "gemini_vertex_ai":
@@ -668,12 +698,20 @@ def _create_llm_from_model_config(model_config: ModelUnion):
             model=model_config.model,
             type="azure_openai",
             default_headers=extra_headers,
+            max_retries=model_config.max_retries,
             **_transport_kwargs(model_config),
         )
     if model_config.type == "gemini":
+        retry_options = None
+        if model_config.max_retries is not None:
+            from google.genai import types as genai_types
+
+            # HttpRetryOptions.attempts counts the initial request too.
+            retry_options = genai_types.HttpRetryOptions(attempts=model_config.max_retries + 1)
         return KAgentGeminiLlm(
             model=model_config.model,
             extra_headers=extra_headers,
+            retry_options=retry_options,
             **_transport_kwargs(model_config),
         )
     if model_config.type == "bedrock":
diff --git a/python/packages/kagent-adk/tests/unittests/models/test_sap_ai_core.py b/python/packages/kagent-adk/tests/unittests/models/test_sap_ai_core.py
index aace9575a9..fab636b547 100644
--- a/python/packages/kagent-adk/tests/unittests/models/test_sap_ai_core.py
+++ b/python/packages/kagent-adk/tests/unittests/models/test_sap_ai_core.py
@@ -17,7 +17,6 @@
     _parse_orchestration_chunk,
 )
 
-
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
diff --git a/python/packages/kagent-adk/tests/unittests/test_model_retry.py b/python/packages/kagent-adk/tests/unittests/test_model_retry.py
new file mode 100644
index 0000000000..306e0cd2c5
--- /dev/null
+++ b/python/packages/kagent-adk/tests/unittests/test_model_retry.py
@@ -0,0 +1,74 @@
+"""Tests for ModelConfig retry (max_retries) plumbing into provider SDK clients."""
+
+import json
+
+from kagent.adk.types import AgentConfig, _create_llm_from_model_config
+
+
+def _make_model_config(**model_extra):
+    config = {
+        "model": {"type": "openai", "model": "gpt-4", **model_extra},
+        "description": "test agent",
+        "instruction": "test instruction",
+    }
+    return AgentConfig.model_validate_json(json.dumps(config)).model
+
+
+class TestMaxRetriesParsing:
+    def test_default_unset(self):
+        model = _make_model_config()
+        assert model.max_retries is None
+
+    def test_max_retries_parsed(self):
+        model = _make_model_config(max_retries=5)
+        assert model.max_retries == 5
+
+
+class TestMaxRetriesWiring:
+    def test_openai_client_max_retries(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
+        llm = _create_llm_from_model_config(_make_model_config(max_retries=5))
+        assert llm.max_retries == 5
+        assert llm._client.max_retries == 5
+
+    def test_openai_client_default_when_unset(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
+        llm = _create_llm_from_model_config(_make_model_config())
+        assert llm.max_retries is None
+        # OpenAI SDK default (2) applies when unset.
+        assert llm._client.max_retries == 2
+
+    def test_anthropic_client_max_retries(self, monkeypatch):
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key")
+        config = {
+            "model": {"type": "anthropic", "model": "claude-sonnet-4-5", "max_retries": 4},
+            "description": "d",
+            "instruction": "i",
+        }
+        llm = AgentConfig.model_validate_json(json.dumps(config)).model
+        llm = _create_llm_from_model_config(llm)
+        assert llm.max_retries == 4
+        assert llm._anthropic_client.max_retries == 4
+
+    def test_gemini_retry_options(self):
+        config = {
+            "model": {"type": "gemini", "model": "gemini-2.0-flash", "max_retries": 3},
+            "description": "d",
+            "instruction": "i",
+        }
+        model = AgentConfig.model_validate_json(json.dumps(config)).model
+        llm = _create_llm_from_model_config(model)
+        # HttpRetryOptions.attempts counts the initial request too.
+        assert llm.retry_options is not None
+        assert llm.retry_options.attempts == 4
+
+    def test_unsupported_provider_logs_warning(self, caplog):
+        config = {
+            "model": {"type": "ollama", "model": "llama3", "max_retries": 3},
+            "description": "d",
+            "instruction": "i",
+        }
+        model = AgentConfig.model_validate_json(json.dumps(config)).model
+        with caplog.at_level("WARNING"):
+            _create_llm_from_model_config(model)
+        assert any("retry.attempts is not supported" in r.message for r in caplog.records)
diff --git a/python/packages/kagent-adk/tests/unittests/test_reliability_config.py b/python/packages/kagent-adk/tests/unittests/test_reliability_config.py
new file mode 100644
index 0000000000..3699fe219a
--- /dev/null
+++ b/python/packages/kagent-adk/tests/unittests/test_reliability_config.py
@@ -0,0 +1,96 @@
+import json
+from types import SimpleNamespace
+
+import pytest
+
+from kagent.adk._reflect_retry_plugin import KAgentReflectAndRetryToolPlugin
+from kagent.adk.types import AgentConfig, ReliabilityConfig
+
+
+def _make_agent_config(**extra) -> AgentConfig:
+    config = {
+        "model": {"type": "openai", "model": "gpt-4"},
+        "description": "test agent",
+        "instruction": "test instruction",
+    }
+    config.update(extra)
+    return AgentConfig.model_validate_json(json.dumps(config))
+
+
+class TestReliabilityConfigParsing:
+    def test_defaults_unset(self):
+        config = _make_agent_config()
+        assert config.reliability is None
+
+    def test_tool_retries(self):
+        config = _make_agent_config(reliability={"tool_retries": 3})
+        assert config.reliability == ReliabilityConfig(tool_retries=3)
+
+    def test_max_llm_calls(self):
+        config = _make_agent_config(reliability={"tool_retries": 3, "max_llm_calls": 25})
+        assert config.reliability == ReliabilityConfig(tool_retries=3, max_llm_calls=25)
+
+    def test_debug_logging(self):
+        config = _make_agent_config(reliability={"debug_logging": True})
+        assert config.reliability is not None
+        assert config.reliability.debug_logging is True
+
+
+class TestKAgentReflectAndRetryToolPlugin:
+    @pytest.mark.asyncio
+    async def test_mcp_error_result_detected(self):
+        plugin = KAgentReflectAndRetryToolPlugin(max_retries=2, throw_exception_if_retry_exceeded=False)
+        result = {"content": [{"type": "text", "text": "apply failed: exit status 1"}], "isError": True}
+        error = await plugin.extract_error_from_result(tool=None, tool_args={}, tool_context=None, result=result)
+        assert error == result
+
+    @pytest.mark.asyncio
+    async def test_successful_result_not_detected(self):
+        plugin = KAgentReflectAndRetryToolPlugin(max_retries=2, throw_exception_if_retry_exceeded=False)
+        result = {"content": [{"type": "text", "text": "applied"}], "isError": False}
+        assert await plugin.extract_error_from_result(tool=None, tool_args={}, tool_context=None, result=result) is None
+        assert await plugin.extract_error_from_result(tool=None, tool_args={}, tool_context=None, result="plain") is None
+
+    @pytest.mark.asyncio
+    async def test_exception_path_still_handled(self):
+        """Tools that raise exceptions go through the inherited on_tool_error_callback."""
+        plugin = KAgentReflectAndRetryToolPlugin(max_retries=2, throw_exception_if_retry_exceeded=False)
+        tool = SimpleNamespace(name="kubectl_apply")
+        ctx = SimpleNamespace(invocation_id="inv-1")
+        response = await plugin.on_tool_error_callback(
+            tool=tool, tool_args={"manifest": "bad"}, tool_context=ctx, error=RuntimeError("boom")
+        )
+        assert response is not None
+        assert "kubectl_apply" in str(response)
+
+    @pytest.mark.asyncio
+    async def test_exception_and_iserror_share_failure_counter(self):
+        """An exception followed by an isError result counts as 2 attempts for the same tool."""
+        plugin = KAgentReflectAndRetryToolPlugin(max_retries=2, throw_exception_if_retry_exceeded=False)
+        tool = SimpleNamespace(name="kubectl_apply")
+        ctx = SimpleNamespace(invocation_id="inv-1")
+        error_result = {"content": [{"type": "text", "text": "failed"}], "isError": True}
+
+        # Attempt 1: exception
+        await plugin.on_tool_error_callback(tool=tool, tool_args={}, tool_context=ctx, error=RuntimeError("boom"))
+        # Attempt 2: isError result (routed via after_tool_callback)
+        await plugin.after_tool_callback(tool=tool, tool_args={}, tool_context=ctx, result=error_result)
+        # Attempt 3: exceeds max_retries=2 -> retry-exceeded guidance instead of raising
+        response = await plugin.after_tool_callback(tool=tool, tool_args={}, tool_context=ctx, result=error_result)
+        assert response is not None
+        assert "2" in str(response)  # mentions the retry limit
+
+    @pytest.mark.asyncio
+    async def test_success_resets_counter(self):
+        plugin = KAgentReflectAndRetryToolPlugin(max_retries=1, throw_exception_if_retry_exceeded=False)
+        tool = SimpleNamespace(name="kubectl_apply")
+        ctx = SimpleNamespace(invocation_id="inv-1")
+        error_result = {"isError": True}
+        ok_result = {"content": [{"type": "text", "text": "applied"}], "isError": False}
+
+        await plugin.after_tool_callback(tool=tool, tool_args={}, tool_context=ctx, result=error_result)
+        # Success resets the per-tool counter
+        assert await plugin.after_tool_callback(tool=tool, tool_args={}, tool_context=ctx, result=ok_result) is None
+        # Next failure is attempt 1 again -> reflection guidance, not retry-exceeded
+        response = await plugin.after_tool_callback(tool=tool, tool_args={}, tool_context=ctx, result=error_result)
+        assert response is not None
diff --git a/ui/src/app/actions/agents.ts b/ui/src/app/actions/agents.ts
index 2ba29c4101..7a942ba83b 100644
--- a/ui/src/app/actions/agents.ts
+++ b/ui/src/app/actions/agents.ts
@@ -203,6 +203,14 @@ function fromAgentFormDataToAgent(agentFormData: AgentFormData): Agent {
       base.spec!.declarative!.context = agentFormData.context;
     }
 
+    if (agentFormData.toolRetries || agentFormData.maxLLMCalls || agentFormData.debugLogging) {
+      base.spec!.declarative!.reliability = {
+        toolRetries: agentFormData.toolRetries,
+        maxLLMCalls: agentFormData.maxLLMCalls,
+        debugLogging: agentFormData.debugLogging || undefined,
+      };
+    }
+
     const trimmedSA = agentFormData.serviceAccountName?.trim();
     if (trimmedSA) {
       base.spec!.declarative!.deployment = {
@@ -373,6 +381,14 @@ function fromAgentFormDataToSandboxAgent(agentFormData: AgentFormData): SandboxA
     decl.context = agentFormData.context;
   }
 
+  if (agentFormData.toolRetries || agentFormData.maxLLMCalls || agentFormData.debugLogging) {
+    decl.reliability = {
+      toolRetries: agentFormData.toolRetries,
+      maxLLMCalls: agentFormData.maxLLMCalls,
+      debugLogging: agentFormData.debugLogging || undefined,
+    };
+  }
+
   const trimmedSA = agentFormData.serviceAccountName?.trim();
   if (trimmedSA) {
     decl.deployment = {
diff --git a/ui/src/app/agents/new/page.tsx b/ui/src/app/agents/new/page.tsx
index 69e567d275..16817e756f 100644
--- a/ui/src/app/agents/new/page.tsx
+++ b/ui/src/app/agents/new/page.tsx
@@ -1,6 +1,6 @@
 "use client";
 import React, { useState, useEffect, Suspense, useCallback, useMemo } from "react";
-import { Loader2 } from "lucide-react";
+import { ChevronDown, ChevronRight, Loader2 } from "lucide-react";
 import { Input } from "@/components/ui/input";
 import { Button } from "@/components/ui/button";
 import { Textarea } from "@/components/ui/textarea";
@@ -105,6 +105,12 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo
     /** Python vs Go ADK (`spec.declarative.runtime`). */
     declarativeRuntime: DeclarativeRuntime;
     contextConfig: ContextConfig | undefined;
+    /** Max consecutive tool-call failures before the agent stops retrying (empty = disabled). */
+    toolRetries: string;
+    /** Cap on total model calls per request (empty = runtime default). */
+    maxLLMCalls: string;
+    /** Log every LLM request/response and tool call to pod logs. */
+    debugLogging: boolean;
     serviceAccountName: string;
     promptSourceRows: PromptSourceRow[];
     isSubmitting: boolean;
@@ -141,6 +147,9 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo
     stream: false,
     declarativeRuntime: "python",
     contextConfig: undefined,
+    toolRetries: "",
+    maxLLMCalls: "",
+    debugLogging: false,
     serviceAccountName: "",
     promptSourceRows: [newPromptSourceRow()],
     isSubmitting: false,
@@ -151,6 +160,8 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo
     substrateSnapshotsLocation: "",
   });
 
+  const [isAdvancedSectionExpanded, setIsAdvancedSectionExpanded] = useState(false);
+
   const substrateEnabled = useSubstrateEnabled();
 
   // When substrate becomes available, prefer it for sandbox agents still on the default platform.
@@ -297,11 +308,17 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo
                     : null,
                   memoryTtlDays: memorySpec?.ttlDays ? String(memorySpec.ttlDays) : "",
                   contextConfig: decl?.context,
+                  toolRetries: decl?.reliability?.toolRetries ? String(decl.reliability.toolRetries) : "",
+                  maxLLMCalls: decl?.reliability?.maxLLMCalls ? String(decl.reliability.maxLLMCalls) : "",
+                  debugLogging: decl?.reliability?.debugLogging ?? false,
                   serviceAccountName: decl?.deployment?.serviceAccountName || "",
                   byoImage: "",
                   byoCmd: "",
                   byoArgs: "",
                 }));
+                if (decl?.reliability?.toolRetries || decl?.reliability?.maxLLMCalls || decl?.reliability?.debugLogging) {
+                  setIsAdvancedSectionExpanded(true);
+                }
               } else {
                 setState((prev) => ({
                   ...prev,
@@ -510,6 +527,11 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo
               }
             : undefined,
         context: useDeclarativeAgentFields ? state.contextConfig : undefined,
+        toolRetries:
+          useDeclarativeAgentFields && state.toolRetries ? parseInt(state.toolRetries, 10) : undefined,
+        maxLLMCalls:
+          useDeclarativeAgentFields && state.maxLLMCalls ? parseInt(state.maxLLMCalls, 10) : undefined,
+        debugLogging: useDeclarativeAgentFields ? state.debugLogging : undefined,
         declarativeRuntime: useDeclarativeAgentFields ? state.declarativeRuntime : undefined,
         byoImage: state.byoImage,
         byoCmd: state.byoCmd || undefined,
@@ -894,6 +916,90 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo
                       </div>
                     </div>
 
+                    <div className="rounded-md border border-border/60">
+                      <button
+                        type="button"
+                        className="flex w-full cursor-pointer items-center justify-between px-3 py-2.5 text-left"
+                        onClick={() => setIsAdvancedSectionExpanded((prev) => !prev)}
+                        aria-expanded={isAdvancedSectionExpanded}
+                        aria-controls="agent-advanced-fields"
+                      >
+                        <span className="space-y-0.5">
+                          <span className="block text-sm font-medium text-foreground">Advanced</span>
+                          <span className="block text-xs text-muted-foreground">
+                            Reliability and debugging: tool retries, model call limits, debug logging
+                          </span>
+                        </span>
+                        {isAdvancedSectionExpanded ? (
+                          <ChevronDown className="h-4 w-4 shrink-0 text-muted-foreground" />
+                        ) : (
+                          <ChevronRight className="h-4 w-4 shrink-0 text-muted-foreground" />
+                        )}
+                      </button>
+                      {isAdvancedSectionExpanded && (
+                        <div id="agent-advanced-fields" className="space-y-5 border-t border-border/60 p-3">
+                          <FieldRoot>
+                            <FieldLabel htmlFor="tool-retries">Tool retries</FieldLabel>
+                            <Input
+                              id="tool-retries"
+                              type="number"
+                              min={1}
+                              max={10}
+                              placeholder="Disabled"
+                              value={state.toolRetries}
+                              onChange={(e) => setState((prev) => ({ ...prev, toolRetries: e.target.value }))}
+                              disabled={disabled}
+                              className="max-w-[180px]"
+                            />
+                            <FieldHint>
+                              Self-healing: when a tool call fails, inject reflection guidance and retry up to this
+                              many times before giving up. Leave empty to disable.
+                            </FieldHint>
+                          </FieldRoot>
+
+                          <FieldRoot>
+                            <FieldLabel htmlFor="max-llm-calls">Max model calls per request</FieldLabel>
+                            <Input
+                              id="max-llm-calls"
+                              type="number"
+                              min={1}
+                              placeholder="Runtime default (500)"
+                              value={state.maxLLMCalls}
+                              onChange={(e) => setState((prev) => ({ ...prev, maxLLMCalls: e.target.value }))}
+                              disabled={disabled}
+                              className="max-w-[180px]"
+                            />
+                            <FieldHint>
+                              Cost safety rail: stop the run with a clear error if the agent exceeds this many model
+                              calls in a single request (like a resource limit, but for tokens).
+                            </FieldHint>
+                          </FieldRoot>
+
+                          <div className="flex gap-3 rounded-md border border-border/60 bg-muted/20 p-3">
+                            <div className="flex h-5 shrink-0 items-center self-start">
+                              <Checkbox
+                                id="debug-logging-toggle"
+                                checked={state.debugLogging}
+                                onCheckedChange={(checked) => setState((prev) => ({ ...prev, debugLogging: !!checked }))}
+                                disabled={disabled}
+                              />
+                            </div>
+                            <div className="min-w-0 space-y-1.5">
+                              <Label
+                                htmlFor="debug-logging-toggle"
+                                className="block cursor-pointer text-sm font-medium leading-5 text-foreground"
+                              >
+                                Debug logging
+                              </Label>
+                              <p className="text-xs leading-snug text-muted-foreground">
+                                Log every LLM request/response and tool call to the agent pod logs
+                              </p>
+                            </div>
+                          </div>
+                        </div>
+                      )}
+                    </div>
+
                     <ServiceAccountNameField
                       value={state.serviceAccountName}
                       onChange={(v) => setState((prev) => ({ ...prev, serviceAccountName: v }))}
diff --git a/ui/src/app/models/new/page.tsx b/ui/src/app/models/new/page.tsx
index 1adc821ee2..f0d5d6d7f5 100644
--- a/ui/src/app/models/new/page.tsx
+++ b/ui/src/app/models/new/page.tsx
@@ -1,7 +1,9 @@
 "use client";
 import React, { useState, useEffect } from "react";
 import { Button } from "@/components/ui/button";
-import { Loader2 } from "lucide-react";
+import { Input } from "@/components/ui/input";
+import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card";
+import { Loader2, ChevronDown, ChevronRight } from "lucide-react";
 import { useRouter, useSearchParams } from "next/navigation";
 import { LoadingState } from "@/components/LoadingState";
 import { ErrorState } from "@/components/ErrorState";
@@ -133,6 +135,8 @@ function ModelPageContent() {
   const [errors, setErrors] = useState<ValidationErrors>({});
   const [isApiKeyNeeded, setIsApiKeyNeeded] = useState(true);
   const [isParamsSectionExpanded, setIsParamsSectionExpanded] = useState(false);
+  const [isAdvancedSectionExpanded, setIsAdvancedSectionExpanded] = useState(false);
+  const [retryAttempts, setRetryAttempts] = useState("");
   const [isFetchingModels, setIsFetchingModels] = useState(false);
   const [existingApiKeySecret, setExistingApiKeySecret] = useState("");
   const [existingApiKeySecretKey, setExistingApiKeySecretKey] = useState("");
@@ -245,6 +249,13 @@ function ModelPageContent() {
           setExistingApiKeySecret(modelData.spec.apiKeySecret || "");
           setExistingApiKeySecretKey(modelData.spec.apiKeySecretKey || "");
 
+          if (modelData.spec.retry?.attempts != null) {
+            setRetryAttempts(String(modelData.spec.retry.attempts));
+            setIsAdvancedSectionExpanded(true);
+          } else {
+            setRetryAttempts("");
+          }
+
           const spec = modelData.spec;
           const fetchedParams: Record<string, unknown> =
             (spec.openAI ?? spec.anthropic ?? spec.azureOpenAI ?? spec.ollama ??
@@ -583,6 +594,13 @@ function ModelPageContent() {
       if (existingApiKeySecretKey) spec.apiKeySecretKey = existingApiKeySecretKey;
     }
 
+    if (retryAttempts.trim() !== "") {
+      const attempts = parseInt(retryAttempts, 10);
+      if (!isNaN(attempts) && attempts >= 0) {
+        spec.retry = { attempts };
+      }
+    }
+
     const providerType = finalSelectedProvider.type;
     switch (providerType) {
       case 'OpenAI':
@@ -766,6 +784,40 @@ function ModelPageContent() {
             />
           )}
 
+          <Card>
+            <CardHeader
+              className="flex flex-row items-center justify-between cursor-pointer"
+              onClick={() => setIsAdvancedSectionExpanded(!isAdvancedSectionExpanded)}
+            >
+              <CardTitle>Advanced</CardTitle>
+              {isAdvancedSectionExpanded
+                ? <ChevronDown className="h-5 w-5" />
+                : <ChevronRight className="h-5 w-5" />}
+            </CardHeader>
+            {isAdvancedSectionExpanded && (
+              <CardContent className="space-y-4">
+                <div className="space-y-1">
+                  <label htmlFor="retry-attempts" className="text-sm font-medium text-gray-800">Retry attempts</label>
+                  <Input
+                    id="retry-attempts"
+                    type="number"
+                    min={0}
+                    max={20}
+                    placeholder="Provider SDK default"
+                    value={retryAttempts}
+                    onChange={(e) => setRetryAttempts(e.target.value)}
+                    disabled={isSubmitting || isLoading}
+                  />
+                  <p className="text-xs text-muted-foreground">
+                    Automatically retry failed model requests (rate limits, timeouts, transient server errors) with
+                    exponential backoff. Set to 0 to disable retries. Supported for OpenAI, Azure OpenAI, Anthropic,
+                    and Gemini.
+                  </p>
+                </div>
+              </CardContent>
+            )}
+          </Card>
+
             <div className="flex justify-end border-t border-border/50 pt-6">
               <Button
                 type="submit"
diff --git a/ui/src/components/AgentsProvider.tsx b/ui/src/components/AgentsProvider.tsx
index 1f9a2acc81..fc1e2b4f55 100644
--- a/ui/src/components/AgentsProvider.tsx
+++ b/ui/src/components/AgentsProvider.tsx
@@ -50,6 +50,12 @@ export interface AgentFormData {
   };
   // Context management
   context?: ContextConfig;
+  /** Max consecutive failures per tool call before the agent stops retrying (reflect-and-retry). */
+  toolRetries?: number;
+  /** Cap on the total number of model calls per request (cost safety rail). */
+  maxLLMCalls?: number;
+  /** Log every LLM request/response and tool call to the agent pod logs. */
+  debugLogging?: boolean;
   promptSources?: Array<{ name: string; alias: string }>;
   /** AgentHarness CR (kagent.dev/v1alpha2 AgentHarness; openclaw, nemoclaw, or hermes backend). */
   agentHarness?: AgentHarnessFormSlice;
diff --git a/ui/src/types/index.ts b/ui/src/types/index.ts
index 7a6ee41841..014c71ac57 100644
--- a/ui/src/types/index.ts
+++ b/ui/src/types/index.ts
@@ -84,6 +84,11 @@ export interface TLSConfig {
   disableSystemCAs?: boolean;
 }
 
+export interface ModelRetryConfig {
+  /** Max retry attempts for failed LLM HTTP requests (429, 408, transient 5xx). 0 disables retries. */
+  attempts: number;
+}
+
 export interface ModelConfigSpec {
   model: string;
   provider: string;
@@ -92,6 +97,7 @@ export interface ModelConfigSpec {
   apiKeyPassthrough?: boolean;
   defaultHeaders?: Record<string, string>;
   tls?: TLSConfig;
+  retry?: ModelRetryConfig;
   openAI?: OpenAIConfig;
   anthropic?: AnthropicConfig;
   azureOpenAI?: AzureOpenAIConfig;
@@ -369,6 +375,17 @@ export interface DeclarativeAgentSpec {
   memory?: MemorySpec;
   /** When set, systemMessage is rendered as a Go text/template with includes and variables. */
   promptTemplate?: PromptTemplateSpec;
+  /** Self-healing and observability behaviors: tool retries, model call caps, debug logging. */
+  reliability?: ReliabilityConfig;
+}
+
+export interface ReliabilityConfig {
+  /** Max consecutive failures for a tool call before the agent stops retrying it. */
+  toolRetries?: number;
+  /** Cap on the total number of model calls per request (cost safety rail). */
+  maxLLMCalls?: number;
+  /** Log every LLM request/response and tool call to the agent pod logs. */
+  debugLogging?: boolean;
 }
 
 export interface ContextConfig {

From cffa7fa94a861fe8e14ad1d73090d1b40d9f201d Mon Sep 17 00:00:00 2001
From: Peter Jausovec <peter.jausovec@solo.io>
Date: Thu, 11 Jun 2026 16:23:24 -0700
Subject: [PATCH 2/2] fix pr feedback

Signed-off-by: Peter Jausovec <peter.jausovec@solo.io>
---
 .../src/kagent/adk/_agent_executor.py         |  8 +++--
 .../unittests/test_reliability_config.py      |  4 ++-
 ui/src/app/actions/agents.ts                  | 31 ++++++++++++-------
 ui/src/app/agents/new/page.tsx                | 12 ++++---
 ui/src/app/models/new/page.tsx                | 16 ++++++++--
 5 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/python/packages/kagent-adk/src/kagent/adk/_agent_executor.py b/python/packages/kagent-adk/src/kagent/adk/_agent_executor.py
index 04a8cde307..045def8aa4 100644
--- a/python/packages/kagent-adk/src/kagent/adk/_agent_executor.py
+++ b/python/packages/kagent-adk/src/kagent/adk/_agent_executor.py
@@ -261,9 +261,13 @@ async def _execute_impl(
 
                 if isinstance(e, LlmCallsLimitExceededError):
                     max_calls = self._kagent_config.max_llm_calls if self._kagent_config is not None else None
-                    limit = f" {max_calls}" if max_calls is not None else ""
+                    limit = (
+                        f"the configured limit of {max_calls}"
+                        if max_calls is not None
+                        else "the runtime default limit on"
+                    )
                     error_message = (
-                        f"Agent stopped: exceeded the configured limit of{limit} model calls for a single request. "
+                        f"Agent stopped: exceeded {limit} model calls for a single request. "
                         "This safety rail prevents runaway loops. If the task legitimately needs more model calls, "
                         "increase reliability.maxLLMCalls on the agent."
                     )
diff --git a/python/packages/kagent-adk/tests/unittests/test_reliability_config.py b/python/packages/kagent-adk/tests/unittests/test_reliability_config.py
index 3699fe219a..95cbe711c4 100644
--- a/python/packages/kagent-adk/tests/unittests/test_reliability_config.py
+++ b/python/packages/kagent-adk/tests/unittests/test_reliability_config.py
@@ -49,7 +49,9 @@ async def test_successful_result_not_detected(self):
         plugin = KAgentReflectAndRetryToolPlugin(max_retries=2, throw_exception_if_retry_exceeded=False)
         result = {"content": [{"type": "text", "text": "applied"}], "isError": False}
         assert await plugin.extract_error_from_result(tool=None, tool_args={}, tool_context=None, result=result) is None
-        assert await plugin.extract_error_from_result(tool=None, tool_args={}, tool_context=None, result="plain") is None
+        assert (
+            await plugin.extract_error_from_result(tool=None, tool_args={}, tool_context=None, result="plain") is None
+        )
 
     @pytest.mark.asyncio
     async def test_exception_path_still_handled(self):
diff --git a/ui/src/app/actions/agents.ts b/ui/src/app/actions/agents.ts
index 7a942ba83b..fddb520661 100644
--- a/ui/src/app/actions/agents.ts
+++ b/ui/src/app/actions/agents.ts
@@ -51,6 +51,19 @@ function attachPromptTemplateToDeclarative(decl: DeclarativeAgentSpec, agentForm
   }
 }
 
+function buildReliabilityForAgentSpec(agentFormData: AgentFormData): DeclarativeAgentSpec["reliability"] | undefined {
+  const sanitize = (v: number | undefined): number | undefined =>
+    typeof v === "number" && Number.isInteger(v) && v >= 0 ? v : undefined;
+  const toolRetries = sanitize(agentFormData.toolRetries);
+  const maxLLMCalls = sanitize(agentFormData.maxLLMCalls);
+  const debugLogging = agentFormData.debugLogging || undefined;
+
+  if (toolRetries === undefined && maxLLMCalls === undefined && !debugLogging) {
+    return undefined;
+  }
+  return { toolRetries, maxLLMCalls, debugLogging };
+}
+
 function buildSkillsForAgentSpec(agentFormData: AgentFormData): SkillForAgent | undefined {
   const refs = (agentFormData.skillRefs || []).map((r) => r.trim()).filter(Boolean);
   const rows: GitSkillFormRow[] = (agentFormData.skillGitRepos || []).map((g) => ({
@@ -203,12 +216,9 @@ function fromAgentFormDataToAgent(agentFormData: AgentFormData): Agent {
       base.spec!.declarative!.context = agentFormData.context;
     }
 
-    if (agentFormData.toolRetries || agentFormData.maxLLMCalls || agentFormData.debugLogging) {
-      base.spec!.declarative!.reliability = {
-        toolRetries: agentFormData.toolRetries,
-        maxLLMCalls: agentFormData.maxLLMCalls,
-        debugLogging: agentFormData.debugLogging || undefined,
-      };
+    const reliability = buildReliabilityForAgentSpec(agentFormData);
+    if (reliability) {
+      base.spec!.declarative!.reliability = reliability;
     }
 
     const trimmedSA = agentFormData.serviceAccountName?.trim();
@@ -381,12 +391,9 @@ function fromAgentFormDataToSandboxAgent(agentFormData: AgentFormData): SandboxA
     decl.context = agentFormData.context;
   }
 
-  if (agentFormData.toolRetries || agentFormData.maxLLMCalls || agentFormData.debugLogging) {
-    decl.reliability = {
-      toolRetries: agentFormData.toolRetries,
-      maxLLMCalls: agentFormData.maxLLMCalls,
-      debugLogging: agentFormData.debugLogging || undefined,
-    };
+  const reliability = buildReliabilityForAgentSpec(agentFormData);
+  if (reliability) {
+    decl.reliability = reliability;
   }
 
   const trimmedSA = agentFormData.serviceAccountName?.trim();
diff --git a/ui/src/app/agents/new/page.tsx b/ui/src/app/agents/new/page.tsx
index 16817e756f..ab7692916d 100644
--- a/ui/src/app/agents/new/page.tsx
+++ b/ui/src/app/agents/new/page.tsx
@@ -60,6 +60,12 @@ interface AgentPageContentProps {
   agentNamespace: string | null;
 }
 
+/** Parses a form field into a positive integer, returning undefined for empty/invalid input (avoids NaN in payloads). */
+function parseOptionalPositiveInt(value: string): number | undefined {
+  const parsed = parseInt(value, 10);
+  return Number.isInteger(parsed) && parsed > 0 ? parsed : undefined;
+}
+
 const DEFAULT_SYSTEM_PROMPT = `You're a helpful agent, made by the kagent team.
 
 # Instructions
@@ -527,10 +533,8 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo
               }
             : undefined,
         context: useDeclarativeAgentFields ? state.contextConfig : undefined,
-        toolRetries:
-          useDeclarativeAgentFields && state.toolRetries ? parseInt(state.toolRetries, 10) : undefined,
-        maxLLMCalls:
-          useDeclarativeAgentFields && state.maxLLMCalls ? parseInt(state.maxLLMCalls, 10) : undefined,
+        toolRetries: useDeclarativeAgentFields ? parseOptionalPositiveInt(state.toolRetries) : undefined,
+        maxLLMCalls: useDeclarativeAgentFields ? parseOptionalPositiveInt(state.maxLLMCalls) : undefined,
         debugLogging: useDeclarativeAgentFields ? state.debugLogging : undefined,
         declarativeRuntime: useDeclarativeAgentFields ? state.declarativeRuntime : undefined,
         byoImage: state.byoImage,
diff --git a/ui/src/app/models/new/page.tsx b/ui/src/app/models/new/page.tsx
index f0d5d6d7f5..19f57f4506 100644
--- a/ui/src/app/models/new/page.tsx
+++ b/ui/src/app/models/new/page.tsx
@@ -786,16 +786,26 @@ function ModelPageContent() {
 
           <Card>
             <CardHeader
+              role="button"
+              tabIndex={0}
+              aria-expanded={isAdvancedSectionExpanded}
+              aria-controls="model-advanced-fields"
               className="flex flex-row items-center justify-between cursor-pointer"
               onClick={() => setIsAdvancedSectionExpanded(!isAdvancedSectionExpanded)}
+              onKeyDown={(e) => {
+                if (e.key === "Enter" || e.key === " ") {
+                  e.preventDefault();
+                  setIsAdvancedSectionExpanded(!isAdvancedSectionExpanded);
+                }
+              }}
             >
               <CardTitle>Advanced</CardTitle>
               {isAdvancedSectionExpanded
-                ? <ChevronDown className="h-5 w-5" />
-                : <ChevronRight className="h-5 w-5" />}
+                ? <ChevronDown className="h-5 w-5" aria-hidden />
+                : <ChevronRight className="h-5 w-5" aria-hidden />}
             </CardHeader>
             {isAdvancedSectionExpanded && (
-              <CardContent className="space-y-4">
+              <CardContent id="model-advanced-fields" className="space-y-4">
                 <div className="space-y-1">
                   <label htmlFor="retry-attempts" className="text-sm font-medium text-gray-800">Retry attempts</label>
                   <Input