From 318933d53efa49f866040607b5c31f1d03e2e664 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Thu, 11 Jun 2026 16:02:45 -0700 Subject: [PATCH 1/2] implement agent reliability features from ADK Signed-off-by: Peter Jausovec --- docs/architecture/crds-and-types.md | 13 +- go/adk/pkg/agent/agent.go | 14 + go/adk/pkg/models/anthropic.go | 9 + go/adk/pkg/models/base.go | 1 + go/adk/pkg/models/openai.go | 6 + go/adk/pkg/runner/adapter.go | 51 +++ go/adk/pkg/runner/adapter_test.go | 128 ++++++++ go/adk/pkg/runner/maxllmcalls.go | 42 +++ go/api/adk/types.go | 21 ++ .../config/crd/bases/kagent.dev_agents.yaml | 29 ++ .../crd/bases/kagent.dev_modelconfigs.yaml | 20 ++ .../crd/bases/kagent.dev_sandboxagents.yaml | 29 ++ go/api/v1alpha2/agent_types.go | 29 ++ go/api/v1alpha2/modelconfig_types.go | 20 ++ go/api/v1alpha2/zz_generated.deepcopy.go | 50 +++ .../translator/agent/adk_api_translator.go | 51 +-- .../controller/translator/agent/compiler.go | 13 + .../inputs/agent_with_reliability.yaml | 37 +++ .../inputs/modelconfig_with_retry.yaml | 35 ++ .../outputs/agent_with_reliability.json | 299 ++++++++++++++++++ .../outputs/modelconfig_with_retry.json | 295 +++++++++++++++++ .../templates/kagent.dev_agents.yaml | 29 ++ .../templates/kagent.dev_modelconfigs.yaml | 20 ++ .../templates/kagent.dev_sandboxagents.yaml | 29 ++ .../kagent-adk/src/kagent/adk/_a2a.py | 9 +- .../src/kagent/adk/_agent_executor.py | 22 +- .../src/kagent/adk/_reflect_retry_plugin.py | 33 ++ .../packages/kagent-adk/src/kagent/adk/cli.py | 23 ++ .../adk/converters/request_converter.py | 7 +- .../src/kagent/adk/models/_anthropic.py | 6 + .../src/kagent/adk/models/_openai.py | 14 + .../kagent-adk/src/kagent/adk/types.py | 40 ++- .../unittests/models/test_sap_ai_core.py | 1 - .../tests/unittests/test_model_retry.py | 74 +++++ .../unittests/test_reliability_config.py | 96 ++++++ ui/src/app/actions/agents.ts | 16 + ui/src/app/agents/new/page.tsx | 108 ++++++- ui/src/app/models/new/page.tsx | 54 +++- ui/src/components/AgentsProvider.tsx | 6 + ui/src/types/index.ts | 17 + 40 files changed, 1762 insertions(+), 34 deletions(-) create mode 100644 go/adk/pkg/runner/adapter_test.go create mode 100644 go/adk/pkg/runner/maxllmcalls.go create mode 100644 go/core/internal/controller/translator/agent/testdata/inputs/agent_with_reliability.yaml create mode 100644 go/core/internal/controller/translator/agent/testdata/inputs/modelconfig_with_retry.yaml create mode 100644 go/core/internal/controller/translator/agent/testdata/outputs/agent_with_reliability.json create mode 100644 go/core/internal/controller/translator/agent/testdata/outputs/modelconfig_with_retry.json create mode 100644 python/packages/kagent-adk/src/kagent/adk/_reflect_retry_plugin.py create mode 100644 python/packages/kagent-adk/tests/unittests/test_model_retry.py create mode 100644 python/packages/kagent-adk/tests/unittests/test_reliability_config.py diff --git a/docs/architecture/crds-and-types.md b/docs/architecture/crds-and-types.md index 7b0a084400..d886e114e2 100644 --- a/docs/architecture/crds-and-types.md +++ b/docs/architecture/crds-and-types.md @@ -74,6 +74,10 @@ AgentSpec │ │ ├── summarizer: ContextSummarizerConfig │ │ ├── tokenThreshold: int │ │ └── eventRetentionSize: int +│ ├── reliability: ReliabilityConfig +│ │ ├── toolRetries: int (reflect-and-retry on failed tool calls) +│ │ ├── maxLLMCalls: int (cap on model calls per request) +│ │ └── debugLogging: bool (log every LLM request/response and tool call) │ └── executeCodeBlocks: bool (currently ignored) │ └── byo: BYOAgentSpec (if type=BYO) @@ -126,6 +130,10 @@ ModelConfigSpec │ ├── caCertSecretKey: string │ └── disableSystemCAs: bool │ +├── retry: ModelRetryConfig +│ └── attempts: int (max retries of failed LLM HTTP requests with exponential backoff; +│ OpenAI/AzureOpenAI/Anthropic/Gemini only) +│ ├── openAI: OpenAIConfig │ ├── baseUrl, temperature, maxTokens, topP │ ├── frequencyPenalty, presencePenalty @@ -340,7 +348,8 @@ When adding a field to an existing CRD, update all layers: 5. **Translator** — `go/core/internal/controller/translator/agent/adk_api_translator.go` (wire field into config) 6. **Python ADK types** — `python/packages/kagent-adk/src/kagent/adk/types.py` (mirror Go types) 7. **Python runtime** — Use the field in agent setup if it affects runtime behavior -8. **Tests** — Translator unit tests (golden files), E2E tests -9. **Helm values** — If exposed to users installing via Helm +8. **Go runtime** — `go/adk/pkg/` (mirror runtime behavior for `runtime: go` agents) +9. **Tests** — Translator unit tests (golden files), E2E tests +10. **Helm values** — If exposed to users installing via Helm See [controller-reconciliation.md](controller-reconciliation.md) for the reconciliation flow and the kagent-dev skill for step-by-step examples. diff --git a/go/adk/pkg/agent/agent.go b/go/adk/pkg/agent/agent.go index 1aae3637d6..a14df02bec 100644 --- a/go/adk/pkg/agent/agent.go +++ b/go/adk/pkg/agent/agent.go @@ -228,6 +228,7 @@ func CreateLLM(ctx context.Context, m adk.Model, log logr.Logger) (adkmodel.LLM, if err != nil { return nil, fmt.Errorf("failed to build HTTP client for Gemini: %w", err) } + warnIgnoredMaxRetries(log, m.BaseModel, "gemini") return adkgemini.NewModel(ctx, modelName, &genai.ClientConfig{ APIKey: apiKey, HTTPClient: httpClient, @@ -246,6 +247,7 @@ func CreateLLM(ctx context.Context, m adk.Model, log logr.Logger) (adkmodel.LLM, if modelName == "" { modelName = DefaultGeminiModel } + warnIgnoredMaxRetries(log, m.BaseModel, "gemini_vertex_ai") return adkgemini.NewModel(ctx, modelName, &genai.ClientConfig{ Backend: genai.BackendVertexAI, Project: project, @@ -278,6 +280,7 @@ func CreateLLM(ctx context.Context, m adk.Model, log logr.Logger) (adkmodel.LLM, modelName = DefaultOllamaModel } // Create OllamaConfig with native SDK support for Ollama-specific options + warnIgnoredMaxRetries(log, m.BaseModel, "ollama") cfg := &models.OllamaConfig{ TransportConfig: transportConfigFromBase(m.BaseModel, nil), Model: modelName, @@ -299,6 +302,7 @@ func CreateLLM(ctx context.Context, m adk.Model, log logr.Logger) (adkmodel.LLM, return nil, fmt.Errorf("bedrock requires a model name (e.g. anthropic.claude-3-sonnet-20240229-v1:0)") } // Use Bedrock Converse API for ALL models (including Anthropic) + warnIgnoredMaxRetries(log, m.BaseModel, "bedrock") cfg := &models.BedrockConfig{ TransportConfig: transportConfigFromBase(m.BaseModel, nil), Model: modelName, @@ -329,6 +333,7 @@ func CreateLLM(ctx context.Context, m adk.Model, log logr.Logger) (adkmodel.LLM, return models.NewAnthropicVertexAIModelWithLogger(ctx, cfg, region, project, log) case *adk.SAPAICore: + warnIgnoredMaxRetries(log, m.BaseModel, "sap_ai_core") cfg := models.SAPAICoreConfig{ Model: m.Model, BaseUrl: m.BaseUrl, @@ -352,6 +357,15 @@ func transportConfigFromBase(b adk.BaseModel, timeout *int) models.TransportConf TLSDisableSystemCAs: b.TLSDisableSystemCAs, APIKeyPassthrough: b.APIKeyPassthrough, Timeout: timeout, + MaxRetries: b.MaxRetries, + } +} + +// warnIgnoredMaxRetries logs a warning when retry configuration is set on a +// provider whose Go SDK does not support configurable HTTP retries. +func warnIgnoredMaxRetries(log logr.Logger, b adk.BaseModel, provider string) { + if b.MaxRetries != nil { + log.Info("Model retry configuration (max_retries) is not supported for this provider in the Go runtime; ignoring", "provider", provider) } } diff --git a/go/adk/pkg/models/anthropic.go b/go/adk/pkg/models/anthropic.go index 0dee13adcb..8ce2162e8f 100644 --- a/go/adk/pkg/models/anthropic.go +++ b/go/adk/pkg/models/anthropic.go @@ -65,6 +65,9 @@ func newAnthropicModelFromConfig(config *AnthropicConfig, apiKey string, logger if config.BaseUrl != "" { opts = append(opts, option.WithBaseURL(config.BaseUrl)) } + if config.MaxRetries != nil { + opts = append(opts, option.WithMaxRetries(*config.MaxRetries)) + } // Create HTTP client with TLS, custom headers, and timeout. httpClient, err := BuildHTTPClient(config.TransportConfig) @@ -95,6 +98,9 @@ func NewAnthropicVertexAIModelWithLogger(ctx context.Context, config *AnthropicC opts := []option.RequestOption{ vertex.WithGoogleAuth(ctx, region, projectID), } + if config.MaxRetries != nil { + opts = append(opts, option.WithMaxRetries(*config.MaxRetries)) + } // Create HTTP client with timeout, custom headers, TLS, and passthrough httpClient, err := BuildHTTPClient(config.TransportConfig) @@ -126,6 +132,9 @@ func NewAnthropicBedrockModelWithLogger(ctx context.Context, config *AnthropicCo awsconfig.WithRegion(region), ), } + if config.MaxRetries != nil { + opts = append(opts, option.WithMaxRetries(*config.MaxRetries)) + } // Create HTTP client with timeout, custom headers, TLS, and passthrough httpClient, err := BuildHTTPClient(config.TransportConfig) diff --git a/go/adk/pkg/models/base.go b/go/adk/pkg/models/base.go index 572687bf2d..b9e9872d19 100644 --- a/go/adk/pkg/models/base.go +++ b/go/adk/pkg/models/base.go @@ -20,6 +20,7 @@ type TransportConfig struct { TLSDisableSystemCAs *bool APIKeyPassthrough bool Timeout *int // seconds; nil = defaultTimeout + MaxRetries *int // HTTP retry attempts for transient failures; nil = SDK default } // BuildHTTPClient creates an http.Client with the full transport stack: diff --git a/go/adk/pkg/models/openai.go b/go/adk/pkg/models/openai.go index fc975cce00..4f959d2141 100644 --- a/go/adk/pkg/models/openai.go +++ b/go/adk/pkg/models/openai.go @@ -84,6 +84,9 @@ func newOpenAIModelFromConfig(config *OpenAIConfig, apiKey string, logger logr.L if config.BaseUrl != "" { opts = append(opts, option.WithBaseURL(config.BaseUrl)) } + if config.MaxRetries != nil { + opts = append(opts, option.WithMaxRetries(*config.MaxRetries)) + } httpClient, err := BuildHTTPClient(config.TransportConfig) if err != nil { return nil, err @@ -123,6 +126,9 @@ func NewAzureOpenAIModelWithLogger(config *AzureOpenAIConfig, logger logr.Logger option.WithQueryAdd("api-version", apiVersion), option.WithMiddleware(azurePathRewriteMiddleware()), } + if config.MaxRetries != nil { + opts = append(opts, option.WithMaxRetries(*config.MaxRetries)) + } if !config.APIKeyPassthrough { apiKey := os.Getenv("AZURE_OPENAI_API_KEY") diff --git a/go/adk/pkg/runner/adapter.go b/go/adk/pkg/runner/adapter.go index 0441f778c0..f13bfa44c9 100644 --- a/go/adk/pkg/runner/adapter.go +++ b/go/adk/pkg/runner/adapter.go @@ -14,6 +14,8 @@ import ( "github.com/kagent-dev/kagent/go/api/adk" adkmemory "google.golang.org/adk/memory" adkplugin "google.golang.org/adk/plugin" + "google.golang.org/adk/plugin/loggingplugin" + "google.golang.org/adk/plugin/retryandreflect" "google.golang.org/adk/runner" adksession "google.golang.org/adk/session" adktool "google.golang.org/adk/tool" @@ -83,6 +85,12 @@ func CreateRunnerConfig( } } + reliabilityPlugins, err := buildReliabilityPlugins(agentConfig.Reliability, log) + if err != nil { + return runner.Config{}, nil, err + } + adkPlugins = append(adkPlugins, reliabilityPlugins...) + cfg := runner.Config{ AppName: appName, Agent: adkAgent, @@ -96,6 +104,49 @@ func CreateRunnerConfig( return cfg, subagentSessionIDs, nil } +// buildReliabilityPlugins translates the agent reliability configuration into +// ADK plugins: debug logging, tool retry (reflect-and-retry), and a max LLM +// calls limit. +func buildReliabilityPlugins(r *adk.ReliabilityConfig, log logr.Logger) ([]*adkplugin.Plugin, error) { + if r == nil { + return nil, nil + } + + var plugins []*adkplugin.Plugin + + if r.DebugLogging != nil && *r.DebugLogging { + p, err := loggingplugin.New("kagent_debug_logging") + if err != nil { + return nil, fmt.Errorf("failed to create debug logging plugin: %w", err) + } + plugins = append(plugins, p) + log.Info("Debug logging enabled for agent") + } + + if r.ToolRetries != nil && *r.ToolRetries > 0 { + p, err := retryandreflect.New( + retryandreflect.WithMaxRetries(*r.ToolRetries), + retryandreflect.WithErrorIfRetryExceeded(false), + ) + if err != nil { + return nil, fmt.Errorf("failed to create tool retry plugin: %w", err) + } + plugins = append(plugins, p) + log.Info("Tool retry enabled for agent", "toolRetries", *r.ToolRetries) + } + + if r.MaxLLMCalls != nil && *r.MaxLLMCalls > 0 { + p, err := newMaxLLMCallsPlugin(*r.MaxLLMCalls) + if err != nil { + return nil, fmt.Errorf("failed to create max LLM calls plugin: %w", err) + } + plugins = append(plugins, p) + log.Info("Max LLM calls limit enabled for agent", "maxLLMCalls", *r.MaxLLMCalls) + } + + return plugins, nil +} + func buildTokenPropagationPlugin(ctx context.Context, log logr.Logger) (*sts.TokenPropagationPlugin, error) { propagateToken := strings.EqualFold(strings.TrimSpace(os.Getenv("KAGENT_PROPAGATE_TOKEN")), "true") stsWellKnownURI := strings.TrimSpace(os.Getenv("STS_WELL_KNOWN_URI")) diff --git a/go/adk/pkg/runner/adapter_test.go b/go/adk/pkg/runner/adapter_test.go new file mode 100644 index 0000000000..b15ab10679 --- /dev/null +++ b/go/adk/pkg/runner/adapter_test.go @@ -0,0 +1,128 @@ +package runner + +import ( + "context" + "strings" + "testing" + + "github.com/go-logr/logr" + "github.com/kagent-dev/kagent/go/api/adk" + "google.golang.org/adk/agent" + "google.golang.org/adk/session" + "google.golang.org/genai" +) + +func TestBuildReliabilityPlugins(t *testing.T) { + tests := []struct { + name string + config *adk.ReliabilityConfig + wantNames []string + }{ + { + name: "nil config", + config: nil, + wantNames: nil, + }, + { + name: "empty config", + config: &adk.ReliabilityConfig{}, + wantNames: nil, + }, + { + name: "debug logging only", + config: &adk.ReliabilityConfig{DebugLogging: new(true)}, + wantNames: []string{"kagent_debug_logging"}, + }, + { + name: "debug logging false", + config: &adk.ReliabilityConfig{DebugLogging: new(false)}, + wantNames: nil, + }, + { + name: "tool retries only", + config: &adk.ReliabilityConfig{ToolRetries: new(3)}, + wantNames: []string{"RetryAndReflectPlugin"}, + }, + { + name: "max llm calls only", + config: &adk.ReliabilityConfig{MaxLLMCalls: new(10)}, + wantNames: []string{"kagent_max_llm_calls"}, + }, + { + name: "all enabled", + config: &adk.ReliabilityConfig{ + ToolRetries: new(2), + MaxLLMCalls: new(50), + DebugLogging: new(true), + }, + wantNames: []string{"kagent_debug_logging", "RetryAndReflectPlugin", "kagent_max_llm_calls"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + plugins, err := buildReliabilityPlugins(tt.config, logr.Discard()) + if err != nil { + t.Fatalf("buildReliabilityPlugins() error = %v", err) + } + if len(plugins) != len(tt.wantNames) { + t.Fatalf("got %d plugins, want %d", len(plugins), len(tt.wantNames)) + } + for i, want := range tt.wantNames { + if got := plugins[i].Name(); got != want { + t.Errorf("plugin[%d].Name() = %q, want %q", i, got, want) + } + } + }) + } +} + +// fakeCallbackContext is a minimal agent.CallbackContext for testing. +type fakeCallbackContext struct { + context.Context + invocationID string +} + +func (f *fakeCallbackContext) UserContent() *genai.Content { return nil } +func (f *fakeCallbackContext) InvocationID() string { return f.invocationID } +func (f *fakeCallbackContext) AgentName() string { return "test-agent" } +func (f *fakeCallbackContext) ReadonlyState() session.ReadonlyState { return nil } +func (f *fakeCallbackContext) UserID() string { return "user" } +func (f *fakeCallbackContext) AppName() string { return "app" } +func (f *fakeCallbackContext) SessionID() string { return "session" } +func (f *fakeCallbackContext) Branch() string { return "" } +func (f *fakeCallbackContext) Artifacts() agent.Artifacts { return nil } +func (f *fakeCallbackContext) State() session.State { return nil } + +func TestMaxLLMCallsPlugin(t *testing.T) { + p, err := newMaxLLMCallsPlugin(2) + if err != nil { + t.Fatalf("newMaxLLMCallsPlugin() error = %v", err) + } + cb := p.BeforeModelCallback() + if cb == nil { + t.Fatal("BeforeModelCallback is nil") + } + + ctxA := &fakeCallbackContext{Context: context.Background(), invocationID: "inv-a"} + ctxB := &fakeCallbackContext{Context: context.Background(), invocationID: "inv-b"} + + // First two calls within the limit succeed. + for i := range 2 { + if _, err := cb(ctxA, nil); err != nil { + t.Fatalf("call %d: unexpected error: %v", i+1, err) + } + } + + // Third call exceeds the limit. + if _, err := cb(ctxA, nil); err == nil { + t.Fatal("expected error after exceeding limit, got nil") + } else if !strings.Contains(err.Error(), "limit of 2 model calls") { + t.Errorf("unexpected error message: %v", err) + } + + // A different invocation has its own counter. + if _, err := cb(ctxB, nil); err != nil { + t.Fatalf("different invocation should not be limited: %v", err) + } +} diff --git a/go/adk/pkg/runner/maxllmcalls.go b/go/adk/pkg/runner/maxllmcalls.go new file mode 100644 index 0000000000..38b024d3f0 --- /dev/null +++ b/go/adk/pkg/runner/maxllmcalls.go @@ -0,0 +1,42 @@ +package runner + +import ( + "fmt" + "sync" + + "google.golang.org/adk/agent" + "google.golang.org/adk/model" + adkplugin "google.golang.org/adk/plugin" +) + +// newMaxLLMCallsPlugin returns a plugin that enforces a limit on the number of +// LLM calls within a single invocation. The Go ADK has no native equivalent of +// the Python RunConfig.max_llm_calls, so this is implemented as a +// BeforeModelCallback that counts model calls per invocation ID and aborts the +// run when the limit is exceeded. +func newMaxLLMCallsPlugin(limit int) (*adkplugin.Plugin, error) { + var mu sync.Mutex + counts := make(map[string]int) + + return adkplugin.New(adkplugin.Config{ + Name: "kagent_max_llm_calls", + BeforeModelCallback: func(ctx agent.CallbackContext, llmRequest *model.LLMRequest) (*model.LLMResponse, error) { + mu.Lock() + defer mu.Unlock() + id := ctx.InvocationID() + counts[id]++ + if counts[id] > limit { + return nil, fmt.Errorf( + "agent stopped: exceeded the configured limit of %d model calls in a single run (reliability.maxLLMCalls)", + limit, + ) + } + return nil, nil + }, + AfterRunCallback: func(ictx agent.InvocationContext) { + mu.Lock() + defer mu.Unlock() + delete(counts, ictx.InvocationID()) + }, + }) +} diff --git a/go/api/adk/types.go b/go/api/adk/types.go index 4f15f9f899..6d739e0b46 100644 --- a/go/api/adk/types.go +++ b/go/api/adk/types.go @@ -61,6 +61,11 @@ type BaseModel struct { // APIKeyPassthrough enables forwarding the Bearer token from incoming requests // as the LLM API key instead of using a static secret. APIKeyPassthrough bool `json:"api_key_passthrough,omitempty"` + + // MaxRetries is the maximum number of times a failed LLM HTTP request + // (429 rate limit, 408 timeout, transient 5xx) is retried with + // exponential backoff by the provider SDK. Nil means SDK default. + MaxRetries *int `json:"max_retries,omitempty"` } // GDCHTokenExchangeConfig holds the GDCH-specific token exchange fields @@ -442,6 +447,19 @@ type NetworkConfig struct { AllowedDomains []string `json:"allowed_domains,omitempty"` } +// ReliabilityConfig groups reliability-related configuration that flows +// through config.json to the Python runtime. +type ReliabilityConfig struct { + // ToolRetries is the maximum number of consecutive failures for a tool + // call before the runtime stops retrying it (reflect-and-retry). + ToolRetries *int `json:"tool_retries,omitempty"` + // MaxLLMCalls caps the total number of model calls per request. + MaxLLMCalls *int `json:"max_llm_calls,omitempty"` + // DebugLogging enables verbose logging of every LLM request/response + // and tool call. + DebugLogging *bool `json:"debug_logging,omitempty"` +} + // AgentContextConfig is the context management configuration that flows through config.json to the Python runtime. type AgentContextConfig struct { Compaction *AgentCompressionConfig `json:"compaction,omitempty"` @@ -497,6 +515,7 @@ type AgentConfig struct { Memory *MemoryConfig `json:"memory,omitempty"` Network *NetworkConfig `json:"network,omitempty"` ContextConfig *AgentContextConfig `json:"context_config,omitempty"` + Reliability *ReliabilityConfig `json:"reliability,omitempty"` } // GetStream returns the stream value or default if not set @@ -528,6 +547,7 @@ func (a *AgentConfig) UnmarshalJSON(data []byte) error { Memory json.RawMessage `json:"memory"` Network *NetworkConfig `json:"network,omitempty"` ContextConfig *AgentContextConfig `json:"context_config,omitempty"` + Reliability *ReliabilityConfig `json:"reliability,omitempty"` } if err := json.Unmarshal(data, &tmp); err != nil { return err @@ -557,6 +577,7 @@ func (a *AgentConfig) UnmarshalJSON(data []byte) error { a.Memory = memory a.Network = tmp.Network a.ContextConfig = tmp.ContextConfig + a.Reliability = tmp.Reliability return nil } diff --git a/go/api/config/crd/bases/kagent.dev_agents.yaml b/go/api/config/crd/bases/kagent.dev_agents.yaml index 8338f8e182..4d4a50483f 100644 --- a/go/api/config/crd/bases/kagent.dev_agents.yaml +++ b/go/api/config/crd/bases/kagent.dev_agents.yaml @@ -13110,6 +13110,35 @@ spec: maxItems: 20 type: array type: object + reliability: + description: |- + Reliability configures self-healing and observability behaviors for the + agent runtime, such as reflect-and-retry on failed tool calls, model call + caps, and debug logging. + properties: + debugLogging: + description: |- + DebugLogging enables verbose runtime logging of every LLM + request/response and tool call to the agent pod logs. + Useful for debugging agent behavior; off by default. + type: boolean + maxLLMCalls: + description: |- + MaxLLMCalls caps the total number of model calls per request (cost safety + rail). When the cap is exceeded the run stops with a clear error instead + of looping. If unset, the runtime default applies (500). + minimum: 1 + type: integer + toolRetries: + description: |- + ToolRetries is the maximum number of consecutive failures for a tool call + before the agent stops retrying it. When set, failed tool calls are followed + by structured reflection guidance injected into the model context so the + agent can self-correct instead of repeating the same failing call. + maximum: 10 + minimum: 1 + type: integer + type: object runtime: default: python description: |- diff --git a/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml b/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml index ce185907d9..966ea72a9c 100644 --- a/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml +++ b/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml @@ -628,6 +628,26 @@ spec: - Bedrock - SAPAICore type: string + retry: + description: |- + Retry configures automatic retries of failed LLM HTTP requests + (e.g. 429 rate limits, transient 5xx errors, timeouts) with + exponential backoff, handled by the provider SDK. + properties: + attempts: + description: |- + Attempts is the maximum number of retry attempts after the initial + request fails. Retried errors include rate limits (429), request + timeouts (408), and transient server errors (5xx); backoff between + attempts is exponential. Set to 0 to disable retries entirely. + Supported for the OpenAI, AzureOpenAI, Anthropic, and Gemini providers; + ignored by other providers. + maximum: 20 + minimum: 0 + type: integer + required: + - attempts + type: object sapAICore: description: SAP AI Core-specific configuration properties: diff --git a/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml b/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml index 3f8f594b50..516904f45e 100644 --- a/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml +++ b/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml @@ -10767,6 +10767,35 @@ spec: maxItems: 20 type: array type: object + reliability: + description: |- + Reliability configures self-healing and observability behaviors for the + agent runtime, such as reflect-and-retry on failed tool calls, model call + caps, and debug logging. + properties: + debugLogging: + description: |- + DebugLogging enables verbose runtime logging of every LLM + request/response and tool call to the agent pod logs. + Useful for debugging agent behavior; off by default. + type: boolean + maxLLMCalls: + description: |- + MaxLLMCalls caps the total number of model calls per request (cost safety + rail). When the cap is exceeded the run stops with a clear error instead + of looping. If unset, the runtime default applies (500). + minimum: 1 + type: integer + toolRetries: + description: |- + ToolRetries is the maximum number of consecutive failures for a tool call + before the agent stops retrying it. When set, failed tool calls are followed + by structured reflection guidance injected into the model context so the + agent can self-correct instead of repeating the same failing call. + maximum: 10 + minimum: 1 + type: integer + type: object runtime: default: python description: |- diff --git a/go/api/v1alpha2/agent_types.go b/go/api/v1alpha2/agent_types.go index ebfdcd7325..10556fc63d 100644 --- a/go/api/v1alpha2/agent_types.go +++ b/go/api/v1alpha2/agent_types.go @@ -228,6 +228,35 @@ type DeclarativeAgentSpec struct { // This includes event compaction (compression) and context caching. // +optional Context *ContextConfig `json:"context,omitempty"` + + // Reliability configures self-healing and observability behaviors for the + // agent runtime, such as reflect-and-retry on failed tool calls, model call + // caps, and debug logging. + // +optional + Reliability *ReliabilityConfig `json:"reliability,omitempty"` +} + +// ReliabilityConfig configures self-healing and observability behaviors for the agent runtime. +type ReliabilityConfig struct { + // ToolRetries is the maximum number of consecutive failures for a tool call + // before the agent stops retrying it. When set, failed tool calls are followed + // by structured reflection guidance injected into the model context so the + // agent can self-correct instead of repeating the same failing call. + // +optional + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Maximum=10 + ToolRetries *int `json:"toolRetries,omitempty"` + // MaxLLMCalls caps the total number of model calls per request (cost safety + // rail). When the cap is exceeded the run stops with a clear error instead + // of looping. If unset, the runtime default applies (500). + // +optional + // +kubebuilder:validation:Minimum=1 + MaxLLMCalls *int `json:"maxLLMCalls,omitempty"` + // DebugLogging enables verbose runtime logging of every LLM + // request/response and tool call to the agent pod logs. + // Useful for debugging agent behavior; off by default. + // +optional + DebugLogging bool `json:"debugLogging,omitempty"` } // SandboxPlatform selects the control plane for sandboxed agents. diff --git a/go/api/v1alpha2/modelconfig_types.go b/go/api/v1alpha2/modelconfig_types.go index 3536611744..11e9e62bc8 100644 --- a/go/api/v1alpha2/modelconfig_types.go +++ b/go/api/v1alpha2/modelconfig_types.go @@ -419,6 +419,26 @@ type ModelConfigSpec struct { // that use self-signed certificates or custom certificate authorities. // +optional TLS *TLSConfig `json:"tls,omitempty"` + + // Retry configures automatic retries of failed LLM HTTP requests + // (e.g. 429 rate limits, transient 5xx errors, timeouts) with + // exponential backoff, handled by the provider SDK. + // +optional + Retry *ModelRetryConfig `json:"retry,omitempty"` +} + +// ModelRetryConfig configures automatic retries of failed LLM HTTP requests. +type ModelRetryConfig struct { + // Attempts is the maximum number of retry attempts after the initial + // request fails. Retried errors include rate limits (429), request + // timeouts (408), and transient server errors (5xx); backoff between + // attempts is exponential. Set to 0 to disable retries entirely. + // Supported for the OpenAI, AzureOpenAI, Anthropic, and Gemini providers; + // ignored by other providers. + // +required + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:validation:Maximum=20 + Attempts int `json:"attempts"` } // ModelConfigStatus defines the observed state of ModelConfig. diff --git a/go/api/v1alpha2/zz_generated.deepcopy.go b/go/api/v1alpha2/zz_generated.deepcopy.go index 2c03020b70..3d5d35c354 100644 --- a/go/api/v1alpha2/zz_generated.deepcopy.go +++ b/go/api/v1alpha2/zz_generated.deepcopy.go @@ -874,6 +874,11 @@ func (in *DeclarativeAgentSpec) DeepCopyInto(out *DeclarativeAgentSpec) { *out = new(ContextConfig) (*in).DeepCopyInto(*out) } + if in.Reliability != nil { + in, out := &in.Reliability, &out.Reliability + *out = new(ReliabilityConfig) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeclarativeAgentSpec. @@ -1143,6 +1148,11 @@ func (in *ModelConfigSpec) DeepCopyInto(out *ModelConfigSpec) { *out = new(TLSConfig) **out = **in } + if in.Retry != nil { + in, out := &in.Retry, &out.Retry + *out = new(ModelRetryConfig) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelConfigSpec. @@ -1287,6 +1297,21 @@ func (in *ModelProviderConfigStatus) DeepCopy() *ModelProviderConfigStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ModelRetryConfig) DeepCopyInto(out *ModelRetryConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelRetryConfig. +func (in *ModelRetryConfig) DeepCopy() *ModelRetryConfig { + if in == nil { + return nil + } + out := new(ModelRetryConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NetworkConfig) DeepCopyInto(out *NetworkConfig) { *out = *in @@ -1405,6 +1430,31 @@ func (in *PromptTemplateSpec) DeepCopy() *PromptTemplateSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ReliabilityConfig) DeepCopyInto(out *ReliabilityConfig) { + *out = *in + if in.ToolRetries != nil { + in, out := &in.ToolRetries, &out.ToolRetries + *out = new(int) + **out = **in + } + if in.MaxLLMCalls != nil { + in, out := &in.MaxLLMCalls, &out.MaxLLMCalls + *out = new(int) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReliabilityConfig. +func (in *ReliabilityConfig) DeepCopy() *ReliabilityConfig { + if in == nil { + return nil + } + out := new(ReliabilityConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RemoteMCPServer) DeepCopyInto(out *RemoteMCPServer) { *out = *in diff --git a/go/core/internal/controller/translator/agent/adk_api_translator.go b/go/core/internal/controller/translator/agent/adk_api_translator.go index 61e2892f7f..45f30bbfe0 100644 --- a/go/core/internal/controller/translator/agent/adk_api_translator.go +++ b/go/core/internal/controller/translator/agent/adk_api_translator.go @@ -310,14 +310,19 @@ func deriveTLSFields(tlsConfig *v1alpha2.TLSConfig) (*bool, *string, *bool) { return insecureSkipVerify, caCertPath, disableSystemCAs } -// populateTLSFields writes the derived TLS fields onto an adk.BaseModel. -// Used by every model-provider branch in translateBaseModel — each provider -// embeds BaseModel, so this single call replaces the explicit three-field +// populateBaseModelFields writes the provider-agnostic ModelConfig fields +// (TLS and retry) onto an adk.BaseModel. +// Used by every model-provider branch in translateModel — each provider +// embeds BaseModel, so this single call replaces the explicit per-field // assignment at each site. The MCP-connection params (StreamableHTTPConnectionParams, -// SseConnectionParams) carry the same three fields but do not embed BaseModel; +// SseConnectionParams) carry the same three TLS fields but do not embed BaseModel; // those callers assign through deriveTLSFields directly. -func populateTLSFields(baseModel *adk.BaseModel, tlsConfig *v1alpha2.TLSConfig) { - baseModel.TLSInsecureSkipVerify, baseModel.TLSCACertPath, baseModel.TLSDisableSystemCAs = deriveTLSFields(tlsConfig) +func populateBaseModelFields(baseModel *adk.BaseModel, spec *v1alpha2.ModelConfigSpec) { + baseModel.TLSInsecureSkipVerify, baseModel.TLSCACertPath, baseModel.TLSDisableSystemCAs = deriveTLSFields(spec.TLS) + if spec.Retry != nil { + attempts := spec.Retry.Attempts + baseModel.MaxRetries = &attempts + } } // addTLSConfiguration mounts a CA Secret as a per-Secret read-only volume on @@ -471,8 +476,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC Headers: model.Spec.DefaultHeaders, }, } - // Populate TLS fields in BaseModel - populateTLSFields(&openai.BaseModel, model.Spec.TLS) + // Populate shared fields (TLS, retry) in BaseModel + populateBaseModelFields(&openai.BaseModel, &model.Spec) // Populate TokenExchange fields (OpenAI-specific) addTokenExchangeConfiguration(openai, modelDeploymentData, &model.Spec) openai.APIKeyPassthrough = model.Spec.APIKeyPassthrough @@ -529,8 +534,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC Headers: model.Spec.DefaultHeaders, }, } - // Populate TLS fields in BaseModel - populateTLSFields(&anthropic.BaseModel, model.Spec.TLS) + // Populate shared fields (TLS, retry) in BaseModel + populateBaseModelFields(&anthropic.BaseModel, &model.Spec) anthropic.APIKeyPassthrough = model.Spec.APIKeyPassthrough if model.Spec.Anthropic != nil { @@ -587,8 +592,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC Headers: model.Spec.DefaultHeaders, }, } - // Populate TLS fields in BaseModel - populateTLSFields(&azureOpenAI.BaseModel, model.Spec.TLS) + // Populate shared fields (TLS, retry) in BaseModel + populateBaseModelFields(&azureOpenAI.BaseModel, &model.Spec) azureOpenAI.APIKeyPassthrough = model.Spec.APIKeyPassthrough return azureOpenAI, modelDeploymentData, secretHashBytes, nil @@ -632,8 +637,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC Headers: model.Spec.DefaultHeaders, }, } - // Populate TLS fields in BaseModel - populateTLSFields(&gemini.BaseModel, model.Spec.TLS) + // Populate shared fields (TLS, retry) in BaseModel + populateBaseModelFields(&gemini.BaseModel, &model.Spec) gemini.APIKeyPassthrough = model.Spec.APIKeyPassthrough return gemini, modelDeploymentData, secretHashBytes, nil @@ -673,8 +678,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC Headers: model.Spec.DefaultHeaders, }, } - // Populate TLS fields in BaseModel - populateTLSFields(&anthropic.BaseModel, model.Spec.TLS) + // Populate shared fields (TLS, retry) in BaseModel + populateBaseModelFields(&anthropic.BaseModel, &model.Spec) anthropic.APIKeyPassthrough = model.Spec.APIKeyPassthrough return anthropic, modelDeploymentData, secretHashBytes, nil @@ -697,8 +702,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC }, Options: model.Spec.Ollama.Options, } - // Populate TLS fields in BaseModel - populateTLSFields(&ollama.BaseModel, model.Spec.TLS) + // Populate shared fields (TLS, retry) in BaseModel + populateBaseModelFields(&ollama.BaseModel, &model.Spec) ollama.APIKeyPassthrough = model.Spec.APIKeyPassthrough return ollama, modelDeploymentData, secretHashBytes, nil @@ -720,8 +725,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC Headers: model.Spec.DefaultHeaders, }, } - // Populate TLS fields in BaseModel - populateTLSFields(&gemini.BaseModel, model.Spec.TLS) + // Populate shared fields (TLS, retry) in BaseModel + populateBaseModelFields(&gemini.BaseModel, &model.Spec) return gemini, modelDeploymentData, secretHashBytes, nil case v1alpha2.ModelProviderBedrock: if model.Spec.Bedrock == nil { @@ -808,8 +813,8 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC AdditionalModelRequestFields: additionalFields, } - // Populate TLS fields in BaseModel - populateTLSFields(&bedrock.BaseModel, model.Spec.TLS) + // Populate shared fields (TLS, retry) in BaseModel + populateBaseModelFields(&bedrock.BaseModel, &model.Spec) bedrock.APIKeyPassthrough = model.Spec.APIKeyPassthrough return bedrock, modelDeploymentData, secretHashBytes, nil @@ -858,7 +863,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC AuthUrl: model.Spec.SAPAICore.AuthURL, } - populateTLSFields(&sapAICore.BaseModel, model.Spec.TLS) + populateBaseModelFields(&sapAICore.BaseModel, &model.Spec) sapAICore.APIKeyPassthrough = model.Spec.APIKeyPassthrough return sapAICore, modelDeploymentData, secretHashBytes, nil diff --git a/go/core/internal/controller/translator/agent/compiler.go b/go/core/internal/controller/translator/agent/compiler.go index 9596e28db5..a6da613c65 100644 --- a/go/core/internal/controller/translator/agent/compiler.go +++ b/go/core/internal/controller/translator/agent/compiler.go @@ -187,6 +187,19 @@ func (a *adkApiTranslator) translateInlineAgent(ctx context.Context, agent v1alp } } + // Translate reliability configuration (reflect-and-retry on tool failures, + // LLM call cap, debug logging). + if r := spec.Declarative.Reliability; r != nil && (r.ToolRetries != nil || r.MaxLLMCalls != nil || r.DebugLogging) { + cfg.Reliability = &adk.ReliabilityConfig{ + ToolRetries: r.ToolRetries, + MaxLLMCalls: r.MaxLLMCalls, + } + if r.DebugLogging { + debugLogging := true + cfg.Reliability.DebugLogging = &debugLogging + } + } + // Translate context management configuration if spec.Declarative.Context != nil { contextCfg := &adk.AgentContextConfig{} diff --git a/go/core/internal/controller/translator/agent/testdata/inputs/agent_with_reliability.yaml b/go/core/internal/controller/translator/agent/testdata/inputs/agent_with_reliability.yaml new file mode 100644 index 0000000000..9ca36a105d --- /dev/null +++ b/go/core/internal/controller/translator/agent/testdata/inputs/agent_with_reliability.yaml @@ -0,0 +1,37 @@ +operation: translateAgent +targetObject: basic-agent +namespace: test +objects: + - apiVersion: v1 + kind: Secret + metadata: + name: openai-secret + namespace: test + data: + api-key: c2stdGVzdC1hcGkta2V5 # base64 encoded "sk-test-api-key" + - apiVersion: kagent.dev/v1alpha2 + kind: ModelConfig + metadata: + name: basic-model + namespace: test + spec: + provider: OpenAI + model: gpt-4o + apiKeySecret: openai-secret + apiKeySecretKey: api-key + - apiVersion: kagent.dev/v1alpha2 + kind: Agent + metadata: + name: basic-agent + namespace: test + spec: + type: Declarative + declarative: + description: A test agent with reliability and debug logging enabled + systemMessage: You are a helpful assistant. + modelConfig: basic-model + reliability: + toolRetries: 3 + maxLLMCalls: 25 + debugLogging: true + tools: [] diff --git a/go/core/internal/controller/translator/agent/testdata/inputs/modelconfig_with_retry.yaml b/go/core/internal/controller/translator/agent/testdata/inputs/modelconfig_with_retry.yaml new file mode 100644 index 0000000000..baf8fd4341 --- /dev/null +++ b/go/core/internal/controller/translator/agent/testdata/inputs/modelconfig_with_retry.yaml @@ -0,0 +1,35 @@ +operation: translateAgent +targetObject: basic-agent +namespace: test +objects: + - apiVersion: v1 + kind: Secret + metadata: + name: openai-secret + namespace: test + data: + api-key: c2stdGVzdC1hcGkta2V5 # base64 encoded "sk-test-api-key" + - apiVersion: kagent.dev/v1alpha2 + kind: ModelConfig + metadata: + name: retry-model + namespace: test + spec: + provider: OpenAI + model: gpt-4o + apiKeySecret: openai-secret + apiKeySecretKey: api-key + retry: + attempts: 5 + - apiVersion: kagent.dev/v1alpha2 + kind: Agent + metadata: + name: basic-agent + namespace: test + spec: + type: Declarative + declarative: + description: A test agent whose model retries failed LLM HTTP requests + systemMessage: You are a helpful assistant. + modelConfig: retry-model + tools: [] diff --git a/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_reliability.json b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_reliability.json new file mode 100644 index 0000000000..91b984f2af --- /dev/null +++ b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_reliability.json @@ -0,0 +1,299 @@ +{ + "agentCard": { + "capabilities": { + "streaming": true + }, + "defaultInputModes": [ + "text" + ], + "defaultOutputModes": [ + "text" + ], + "description": "", + "name": "basic_agent", + "skills": null, + "supportedInterfaces": [ + { + "protocolBinding": "JSONRPC", + "protocolVersion": "0.3", + "url": "http://basic-agent.test:8080" + }, + { + "protocolBinding": "JSONRPC", + "protocolVersion": "1.0", + "url": "http://basic-agent.test:8080" + } + ], + "version": "" + }, + "config": { + "description": "", + "instruction": "You are a helpful assistant.", + "model": { + "base_url": "", + "model": "gpt-4o", + "type": "openai" + }, + "reliability": { + "debug_logging": true, + "max_llm_calls": 25, + "tool_retries": 3 + }, + "stream": false + }, + "manifest": [ + { + "apiVersion": "v1", + "kind": "Secret", + "metadata": { + "labels": { + "app": "kagent", + "app.kubernetes.io/managed-by": "kagent", + "app.kubernetes.io/name": "basic-agent", + "app.kubernetes.io/part-of": "kagent", + "kagent": "basic-agent" + }, + "name": "basic-agent", + "namespace": "test", + "ownerReferences": [ + { + "apiVersion": "kagent.dev/v1alpha2", + "blockOwnerDeletion": true, + "controller": true, + "kind": "Agent", + "name": "basic-agent", + "uid": "" + } + ] + }, + "stringData": { + "agent-card.json": "{\n \"defaultInputModes\": [\n \"text\"\n ],\n \"defaultOutputModes\": [\n \"text\"\n ],\n \"description\": \"\",\n \"name\": \"basic_agent\",\n \"version\": \"\",\n \"skills\": [],\n \"capabilities\": {\n \"streaming\": true\n },\n \"supportedInterfaces\": [\n {\n \"url\": \"http://basic-agent.test:8080\",\n \"protocolBinding\": \"JSONRPC\",\n \"protocolVersion\": \"0.3\"\n },\n {\n \"url\": \"http://basic-agent.test:8080\",\n \"protocolBinding\": \"JSONRPC\",\n \"protocolVersion\": \"1.0\"\n }\n ],\n \"url\": \"http://basic-agent.test:8080\",\n \"protocolVersion\": \"0.3\",\n \"preferredTransport\": \"JSONRPC\"\n}", + "config.json": "{\"model\":{\"type\":\"openai\",\"model\":\"gpt-4o\",\"base_url\":\"\"},\"description\":\"\",\"instruction\":\"You are a helpful assistant.\",\"stream\":false,\"reliability\":{\"tool_retries\":3,\"max_llm_calls\":25,\"debug_logging\":true}}" + } + }, + { + "apiVersion": "v1", + "kind": "ServiceAccount", + "metadata": { + "labels": { + "app": "kagent", + "app.kubernetes.io/managed-by": "kagent", + "app.kubernetes.io/name": "basic-agent", + "app.kubernetes.io/part-of": "kagent", + "kagent": "basic-agent" + }, + "name": "basic-agent", + "namespace": "test", + "ownerReferences": [ + { + "apiVersion": "kagent.dev/v1alpha2", + "blockOwnerDeletion": true, + "controller": true, + "kind": "Agent", + "name": "basic-agent", + "uid": "" + } + ] + } + }, + { + "apiVersion": "apps/v1", + "kind": "Deployment", + "metadata": { + "labels": { + "app": "kagent", + "app.kubernetes.io/managed-by": "kagent", + "app.kubernetes.io/name": "basic-agent", + "app.kubernetes.io/part-of": "kagent", + "kagent": "basic-agent" + }, + "name": "basic-agent", + "namespace": "test", + "ownerReferences": [ + { + "apiVersion": "kagent.dev/v1alpha2", + "blockOwnerDeletion": true, + "controller": true, + "kind": "Agent", + "name": "basic-agent", + "uid": "" + } + ] + }, + "spec": { + "selector": { + "matchLabels": { + "app": "kagent", + "kagent": "basic-agent" + } + }, + "strategy": { + "rollingUpdate": { + "maxSurge": 1, + "maxUnavailable": 0 + }, + "type": "RollingUpdate" + }, + "template": { + "metadata": { + "annotations": { + "kagent.dev/config-hash": "17561973486592316877" + }, + "labels": { + "app": "kagent", + "app.kubernetes.io/managed-by": "kagent", + "app.kubernetes.io/name": "basic-agent", + "app.kubernetes.io/part-of": "kagent", + "kagent": "basic-agent" + } + }, + "spec": { + "containers": [ + { + "args": [ + "--host", + "0.0.0.0", + "--port", + "8080", + "--filepath", + "/config" + ], + "env": [ + { + "name": "OPENAI_API_KEY", + "valueFrom": { + "secretKeyRef": { + "key": "api-key", + "name": "openai-secret" + } + } + }, + { + "name": "KAGENT_NAMESPACE", + "valueFrom": { + "fieldRef": { + "fieldPath": "metadata.namespace" + } + } + }, + { + "name": "KAGENT_NAME", + "value": "basic-agent" + }, + { + "name": "KAGENT_URL", + "value": "http://kagent-controller.kagent:8083" + } + ], + "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app", + "imagePullPolicy": "IfNotPresent", + "name": "kagent", + "ports": [ + { + "containerPort": 8080, + "name": "http" + } + ], + "readinessProbe": { + "httpGet": { + "path": "/.well-known/agent-card.json", + "port": "http" + }, + "initialDelaySeconds": 15, + "periodSeconds": 15, + "timeoutSeconds": 15 + }, + "resources": { + "limits": { + "cpu": "2", + "memory": "1Gi" + }, + "requests": { + "cpu": "100m", + "memory": "384Mi" + } + }, + "volumeMounts": [ + { + "mountPath": "/config", + "name": "config" + }, + { + "mountPath": "/var/run/secrets/tokens", + "name": "kagent-token" + } + ] + } + ], + "serviceAccountName": "basic-agent", + "volumes": [ + { + "name": "config", + "secret": { + "secretName": "basic-agent" + } + }, + { + "name": "kagent-token", + "projected": { + "sources": [ + { + "serviceAccountToken": { + "audience": "kagent", + "expirationSeconds": 3600, + "path": "kagent-token" + } + } + ] + } + } + ] + } + } + }, + "status": {} + }, + { + "apiVersion": "v1", + "kind": "Service", + "metadata": { + "labels": { + "app": "kagent", + "app.kubernetes.io/managed-by": "kagent", + "app.kubernetes.io/name": "basic-agent", + "app.kubernetes.io/part-of": "kagent", + "kagent": "basic-agent" + }, + "name": "basic-agent", + "namespace": "test", + "ownerReferences": [ + { + "apiVersion": "kagent.dev/v1alpha2", + "blockOwnerDeletion": true, + "controller": true, + "kind": "Agent", + "name": "basic-agent", + "uid": "" + } + ] + }, + "spec": { + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": 8080 + } + ], + "selector": { + "app": "kagent", + "kagent": "basic-agent" + }, + "type": "ClusterIP" + }, + "status": { + "loadBalancer": {} + } + } + ] +} \ No newline at end of file diff --git a/go/core/internal/controller/translator/agent/testdata/outputs/modelconfig_with_retry.json b/go/core/internal/controller/translator/agent/testdata/outputs/modelconfig_with_retry.json new file mode 100644 index 0000000000..6acedd5e26 --- /dev/null +++ b/go/core/internal/controller/translator/agent/testdata/outputs/modelconfig_with_retry.json @@ -0,0 +1,295 @@ +{ + "agentCard": { + "capabilities": { + "streaming": true + }, + "defaultInputModes": [ + "text" + ], + "defaultOutputModes": [ + "text" + ], + "description": "", + "name": "basic_agent", + "skills": null, + "supportedInterfaces": [ + { + "protocolBinding": "JSONRPC", + "protocolVersion": "0.3", + "url": "http://basic-agent.test:8080" + }, + { + "protocolBinding": "JSONRPC", + "protocolVersion": "1.0", + "url": "http://basic-agent.test:8080" + } + ], + "version": "" + }, + "config": { + "description": "", + "instruction": "You are a helpful assistant.", + "model": { + "base_url": "", + "max_retries": 5, + "model": "gpt-4o", + "type": "openai" + }, + "stream": false + }, + "manifest": [ + { + "apiVersion": "v1", + "kind": "Secret", + "metadata": { + "labels": { + "app": "kagent", + "app.kubernetes.io/managed-by": "kagent", + "app.kubernetes.io/name": "basic-agent", + "app.kubernetes.io/part-of": "kagent", + "kagent": "basic-agent" + }, + "name": "basic-agent", + "namespace": "test", + "ownerReferences": [ + { + "apiVersion": "kagent.dev/v1alpha2", + "blockOwnerDeletion": true, + "controller": true, + "kind": "Agent", + "name": "basic-agent", + "uid": "" + } + ] + }, + "stringData": { + "agent-card.json": "{\n \"defaultInputModes\": [\n \"text\"\n ],\n \"defaultOutputModes\": [\n \"text\"\n ],\n \"description\": \"\",\n \"name\": \"basic_agent\",\n \"version\": \"\",\n \"skills\": [],\n \"capabilities\": {\n \"streaming\": true\n },\n \"supportedInterfaces\": [\n {\n \"url\": \"http://basic-agent.test:8080\",\n \"protocolBinding\": \"JSONRPC\",\n \"protocolVersion\": \"0.3\"\n },\n {\n \"url\": \"http://basic-agent.test:8080\",\n \"protocolBinding\": \"JSONRPC\",\n \"protocolVersion\": \"1.0\"\n }\n ],\n \"url\": \"http://basic-agent.test:8080\",\n \"protocolVersion\": \"0.3\",\n \"preferredTransport\": \"JSONRPC\"\n}", + "config.json": "{\"model\":{\"type\":\"openai\",\"model\":\"gpt-4o\",\"max_retries\":5,\"base_url\":\"\"},\"description\":\"\",\"instruction\":\"You are a helpful assistant.\",\"stream\":false}" + } + }, + { + "apiVersion": "v1", + "kind": "ServiceAccount", + "metadata": { + "labels": { + "app": "kagent", + "app.kubernetes.io/managed-by": "kagent", + "app.kubernetes.io/name": "basic-agent", + "app.kubernetes.io/part-of": "kagent", + "kagent": "basic-agent" + }, + "name": "basic-agent", + "namespace": "test", + "ownerReferences": [ + { + "apiVersion": "kagent.dev/v1alpha2", + "blockOwnerDeletion": true, + "controller": true, + "kind": "Agent", + "name": "basic-agent", + "uid": "" + } + ] + } + }, + { + "apiVersion": "apps/v1", + "kind": "Deployment", + "metadata": { + "labels": { + "app": "kagent", + "app.kubernetes.io/managed-by": "kagent", + "app.kubernetes.io/name": "basic-agent", + "app.kubernetes.io/part-of": "kagent", + "kagent": "basic-agent" + }, + "name": "basic-agent", + "namespace": "test", + "ownerReferences": [ + { + "apiVersion": "kagent.dev/v1alpha2", + "blockOwnerDeletion": true, + "controller": true, + "kind": "Agent", + "name": "basic-agent", + "uid": "" + } + ] + }, + "spec": { + "selector": { + "matchLabels": { + "app": "kagent", + "kagent": "basic-agent" + } + }, + "strategy": { + "rollingUpdate": { + "maxSurge": 1, + "maxUnavailable": 0 + }, + "type": "RollingUpdate" + }, + "template": { + "metadata": { + "annotations": { + "kagent.dev/config-hash": "17753135717083412249" + }, + "labels": { + "app": "kagent", + "app.kubernetes.io/managed-by": "kagent", + "app.kubernetes.io/name": "basic-agent", + "app.kubernetes.io/part-of": "kagent", + "kagent": "basic-agent" + } + }, + "spec": { + "containers": [ + { + "args": [ + "--host", + "0.0.0.0", + "--port", + "8080", + "--filepath", + "/config" + ], + "env": [ + { + "name": "OPENAI_API_KEY", + "valueFrom": { + "secretKeyRef": { + "key": "api-key", + "name": "openai-secret" + } + } + }, + { + "name": "KAGENT_NAMESPACE", + "valueFrom": { + "fieldRef": { + "fieldPath": "metadata.namespace" + } + } + }, + { + "name": "KAGENT_NAME", + "value": "basic-agent" + }, + { + "name": "KAGENT_URL", + "value": "http://kagent-controller.kagent:8083" + } + ], + "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app", + "imagePullPolicy": "IfNotPresent", + "name": "kagent", + "ports": [ + { + "containerPort": 8080, + "name": "http" + } + ], + "readinessProbe": { + "httpGet": { + "path": "/.well-known/agent-card.json", + "port": "http" + }, + "initialDelaySeconds": 15, + "periodSeconds": 15, + "timeoutSeconds": 15 + }, + "resources": { + "limits": { + "cpu": "2", + "memory": "1Gi" + }, + "requests": { + "cpu": "100m", + "memory": "384Mi" + } + }, + "volumeMounts": [ + { + "mountPath": "/config", + "name": "config" + }, + { + "mountPath": "/var/run/secrets/tokens", + "name": "kagent-token" + } + ] + } + ], + "serviceAccountName": "basic-agent", + "volumes": [ + { + "name": "config", + "secret": { + "secretName": "basic-agent" + } + }, + { + "name": "kagent-token", + "projected": { + "sources": [ + { + "serviceAccountToken": { + "audience": "kagent", + "expirationSeconds": 3600, + "path": "kagent-token" + } + } + ] + } + } + ] + } + } + }, + "status": {} + }, + { + "apiVersion": "v1", + "kind": "Service", + "metadata": { + "labels": { + "app": "kagent", + "app.kubernetes.io/managed-by": "kagent", + "app.kubernetes.io/name": "basic-agent", + "app.kubernetes.io/part-of": "kagent", + "kagent": "basic-agent" + }, + "name": "basic-agent", + "namespace": "test", + "ownerReferences": [ + { + "apiVersion": "kagent.dev/v1alpha2", + "blockOwnerDeletion": true, + "controller": true, + "kind": "Agent", + "name": "basic-agent", + "uid": "" + } + ] + }, + "spec": { + "ports": [ + { + "name": "http", + "port": 8080, + "targetPort": 8080 + } + ], + "selector": { + "app": "kagent", + "kagent": "basic-agent" + }, + "type": "ClusterIP" + }, + "status": { + "loadBalancer": {} + } + } + ] +} \ No newline at end of file diff --git a/helm/kagent-crds/templates/kagent.dev_agents.yaml b/helm/kagent-crds/templates/kagent.dev_agents.yaml index 8338f8e182..4d4a50483f 100644 --- a/helm/kagent-crds/templates/kagent.dev_agents.yaml +++ b/helm/kagent-crds/templates/kagent.dev_agents.yaml @@ -13110,6 +13110,35 @@ spec: maxItems: 20 type: array type: object + reliability: + description: |- + Reliability configures self-healing and observability behaviors for the + agent runtime, such as reflect-and-retry on failed tool calls, model call + caps, and debug logging. + properties: + debugLogging: + description: |- + DebugLogging enables verbose runtime logging of every LLM + request/response and tool call to the agent pod logs. + Useful for debugging agent behavior; off by default. + type: boolean + maxLLMCalls: + description: |- + MaxLLMCalls caps the total number of model calls per request (cost safety + rail). When the cap is exceeded the run stops with a clear error instead + of looping. If unset, the runtime default applies (500). + minimum: 1 + type: integer + toolRetries: + description: |- + ToolRetries is the maximum number of consecutive failures for a tool call + before the agent stops retrying it. When set, failed tool calls are followed + by structured reflection guidance injected into the model context so the + agent can self-correct instead of repeating the same failing call. + maximum: 10 + minimum: 1 + type: integer + type: object runtime: default: python description: |- diff --git a/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml b/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml index ce185907d9..966ea72a9c 100644 --- a/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml +++ b/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml @@ -628,6 +628,26 @@ spec: - Bedrock - SAPAICore type: string + retry: + description: |- + Retry configures automatic retries of failed LLM HTTP requests + (e.g. 429 rate limits, transient 5xx errors, timeouts) with + exponential backoff, handled by the provider SDK. + properties: + attempts: + description: |- + Attempts is the maximum number of retry attempts after the initial + request fails. Retried errors include rate limits (429), request + timeouts (408), and transient server errors (5xx); backoff between + attempts is exponential. Set to 0 to disable retries entirely. + Supported for the OpenAI, AzureOpenAI, Anthropic, and Gemini providers; + ignored by other providers. + maximum: 20 + minimum: 0 + type: integer + required: + - attempts + type: object sapAICore: description: SAP AI Core-specific configuration properties: diff --git a/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml b/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml index 3f8f594b50..516904f45e 100644 --- a/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml +++ b/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml @@ -10767,6 +10767,35 @@ spec: maxItems: 20 type: array type: object + reliability: + description: |- + Reliability configures self-healing and observability behaviors for the + agent runtime, such as reflect-and-retry on failed tool calls, model call + caps, and debug logging. + properties: + debugLogging: + description: |- + DebugLogging enables verbose runtime logging of every LLM + request/response and tool call to the agent pod logs. + Useful for debugging agent behavior; off by default. + type: boolean + maxLLMCalls: + description: |- + MaxLLMCalls caps the total number of model calls per request (cost safety + rail). When the cap is exceeded the run stops with a clear error instead + of looping. If unset, the runtime default applies (500). + minimum: 1 + type: integer + toolRetries: + description: |- + ToolRetries is the maximum number of consecutive failures for a tool call + before the agent stops retrying it. When set, failed tool calls are followed + by structured reflection guidance injected into the model context so the + agent can self-correct instead of repeating the same failing call. + maximum: 10 + minimum: 1 + type: integer + type: object runtime: default: python description: |- diff --git a/python/packages/kagent-adk/src/kagent/adk/_a2a.py b/python/packages/kagent-adk/src/kagent/adk/_a2a.py index 7b08e6f179..7db6080b63 100644 --- a/python/packages/kagent-adk/src/kagent/adk/_a2a.py +++ b/python/packages/kagent-adk/src/kagent/adk/_a2a.py @@ -139,7 +139,14 @@ def create_runner() -> Runner: agent_executor = A2aAgentExecutor( runner=create_runner, - config=A2aAgentExecutorConfig(stream=self.stream), + config=A2aAgentExecutorConfig( + stream=self.stream, + max_llm_calls=( + self.agent_config.reliability.max_llm_calls + if self.agent_config and self.agent_config.reliability + else None + ), + ), task_store=task_store, ) diff --git a/python/packages/kagent-adk/src/kagent/adk/_agent_executor.py b/python/packages/kagent-adk/src/kagent/adk/_agent_executor.py index 0ed10a3177..04a8cde307 100644 --- a/python/packages/kagent-adk/src/kagent/adk/_agent_executor.py +++ b/python/packages/kagent-adk/src/kagent/adk/_agent_executor.py @@ -64,6 +64,8 @@ class A2aAgentExecutorConfig(BaseModel): """Configuration for the KAgent A2aAgentExecutor.""" stream: bool = False + # Cap on the total number of model calls per request (None = ADK default). + max_llm_calls: int | None = None def _kagent_request_converter(request, _part_converter=None): @@ -209,7 +211,8 @@ async def _execute_impl( # Convert the a2a request to ADK run args stream = self._kagent_config.stream if self._kagent_config is not None else False - run_args = convert_a2a_request_to_adk_run_args(context, stream=stream) + max_llm_calls = self._kagent_config.max_llm_calls if self._kagent_config is not None else None + run_args = convert_a2a_request_to_adk_run_args(context, stream=stream, max_llm_calls=max_llm_calls) # Prepare span attributes. span_attributes = {} @@ -250,9 +253,22 @@ async def _execute_impl( except Exception as e: logger.error("Error handling A2A request: %s", e, exc_info=True) - # Check if this is a LiteLLM JSON parsing error (common with Ollama models that don't support function calling) error_message = str(e) - if ( + + # LLM call cap (reliability.maxLLMCalls) exceeded — surface a clear, + # user-facing message instead of the raw exception text. + from google.adk.agents.invocation_context import LlmCallsLimitExceededError + + if isinstance(e, LlmCallsLimitExceededError): + max_calls = self._kagent_config.max_llm_calls if self._kagent_config is not None else None + limit = f" {max_calls}" if max_calls is not None else "" + error_message = ( + f"Agent stopped: exceeded the configured limit of{limit} model calls for a single request. " + "This safety rail prevents runaway loops. If the task legitimately needs more model calls, " + "increase reliability.maxLLMCalls on the agent." + ) + # Check if this is a LiteLLM JSON parsing error (common with Ollama models that don't support function calling) + elif ( "JSONDecodeError" in error_message or "Unterminated string" in error_message or "APIConnectionError" in error_message diff --git a/python/packages/kagent-adk/src/kagent/adk/_reflect_retry_plugin.py b/python/packages/kagent-adk/src/kagent/adk/_reflect_retry_plugin.py new file mode 100644 index 0000000000..79c681ec7c --- /dev/null +++ b/python/packages/kagent-adk/src/kagent/adk/_reflect_retry_plugin.py @@ -0,0 +1,33 @@ +from typing import Any, Optional + +from google.adk.plugins import ReflectAndRetryToolPlugin +from google.adk.tools import BaseTool, ToolContext + + +class KAgentReflectAndRetryToolPlugin(ReflectAndRetryToolPlugin): + """ReflectAndRetryToolPlugin that handles both tool failure modes. + + Two ways a tool call can fail: + + 1. The tool raises an exception — handled by the base class via + ``on_tool_error_callback`` (inherited unchanged). + 2. The tool returns normally but reports an error in-band — MCP tools + never raise; they return a CallToolResult dict with ``isError: True``. + The base plugin ignores these, so we override + ``extract_error_from_result`` to treat them as failures. + + Both paths feed the same per-tool failure counter and reflect-and-retry + flow, and a successful call resets the counter. + """ + + async def extract_error_from_result( + self, + *, + tool: BaseTool, + tool_args: dict[str, Any], + tool_context: ToolContext, + result: Any, + ) -> Optional[Any]: + if isinstance(result, dict) and result.get("isError"): + return result + return None diff --git a/python/packages/kagent-adk/src/kagent/adk/cli.py b/python/packages/kagent-adk/src/kagent/adk/cli.py index e32d0aacbf..c7e37a77dc 100644 --- a/python/packages/kagent-adk/src/kagent/adk/cli.py +++ b/python/packages/kagent-adk/src/kagent/adk/cli.py @@ -78,6 +78,29 @@ def static( plugins = [] plugins.append(LLMPassthroughPlugin()) + if agent_config.reliability and agent_config.reliability.debug_logging: + from google.adk.plugins import LoggingPlugin + + if plugins is None: + plugins = [] + plugins.append(LoggingPlugin()) + logger.info("Debug logging plugin enabled") + + if agent_config.reliability and agent_config.reliability.tool_retries: + from ._reflect_retry_plugin import KAgentReflectAndRetryToolPlugin + + if plugins is None: + plugins = [] + plugins.append( + KAgentReflectAndRetryToolPlugin( + max_retries=agent_config.reliability.tool_retries, + # Return reflection guidance to the model instead of raising, + # so the conversation degrades gracefully after max retries. + throw_exception_if_retry_exceeded=False, + ) + ) + logger.info(f"Reflect-and-retry plugin enabled (tool_retries={agent_config.reliability.tool_retries})") + def root_agent_factory() -> BaseAgent: root_agent = agent_config.to_agent(app_cfg.name, sts_integration, propagate_token) diff --git a/python/packages/kagent-adk/src/kagent/adk/converters/request_converter.py b/python/packages/kagent-adk/src/kagent/adk/converters/request_converter.py index 88844daf68..587cfd67c6 100644 --- a/python/packages/kagent-adk/src/kagent/adk/converters/request_converter.py +++ b/python/packages/kagent-adk/src/kagent/adk/converters/request_converter.py @@ -20,10 +20,15 @@ def _get_user_id(request: RequestContext) -> str: def convert_a2a_request_to_adk_run_args( request: RequestContext, stream: bool = False, + max_llm_calls: int | None = None, ) -> dict[str, Any]: if not request.message: raise ValueError("Request message cannot be None") + run_config_kwargs: dict[str, Any] = {"streaming_mode": StreamingMode.SSE if stream else StreamingMode.NONE} + if max_llm_calls is not None: + run_config_kwargs["max_llm_calls"] = max_llm_calls + return { "user_id": _get_user_id(request), "session_id": request.context_id, @@ -31,5 +36,5 @@ def convert_a2a_request_to_adk_run_args( role="user", parts=[convert_a2a_part_to_genai_part(part) for part in request.message.parts], ), - "run_config": RunConfig(streaming_mode=StreamingMode.SSE if stream else StreamingMode.NONE), + "run_config": RunConfig(**run_config_kwargs), } diff --git a/python/packages/kagent-adk/src/kagent/adk/models/_anthropic.py b/python/packages/kagent-adk/src/kagent/adk/models/_anthropic.py index b8e9e68cd0..6848e2ce23 100644 --- a/python/packages/kagent-adk/src/kagent/adk/models/_anthropic.py +++ b/python/packages/kagent-adk/src/kagent/adk/models/_anthropic.py @@ -20,6 +20,10 @@ class KAgentAnthropicLlm(KAgentTLSMixin, AnthropicLlm): api_key_passthrough: Optional[bool] = None + # Max retry attempts for failed HTTP requests (429, 408, transient 5xx). + # Mapped to the Anthropic SDK's max_retries (exponential backoff). None = SDK default (2). + max_retries: Optional[int] = None + _api_key: Optional[str] = None base_url: Optional[str] = None extra_headers: Optional[dict[str, str]] = None @@ -51,6 +55,8 @@ def _anthropic_client(self) -> AsyncAnthropic: kwargs["base_url"] = self.base_url if self.extra_headers: kwargs["default_headers"] = self.extra_headers + if self.max_retries is not None: + kwargs["max_retries"] = self.max_retries # Use the httpx.AsyncClient with SSL configuration if present http_client = self._create_http_client() diff --git a/python/packages/kagent-adk/src/kagent/adk/models/_openai.py b/python/packages/kagent-adk/src/kagent/adk/models/_openai.py index ae6fd6df46..045f801313 100644 --- a/python/packages/kagent-adk/src/kagent/adk/models/_openai.py +++ b/python/packages/kagent-adk/src/kagent/adk/models/_openai.py @@ -385,6 +385,10 @@ class BaseOpenAI(KAgentTLSMixin, BaseLlm): # API key passthrough: forward the Bearer token from incoming requests as the LLM API key api_key_passthrough: Optional[bool] = None + # Max retry attempts for failed HTTP requests (429, 408, transient 5xx). + # Mapped to the OpenAI SDK's max_retries (exponential backoff). None = SDK default (2). + max_retries: Optional[int] = None + # GDCH token exchange: refreshes a short-lived bearer token before each model call. token_exchange: Optional[GDCHTokenSource] = Field(default=None, exclude=True) @@ -415,12 +419,17 @@ def _client(self) -> AsyncOpenAI: """Get the OpenAI client with optional custom SSL configuration.""" http_client = self._create_http_client() + kwargs = {} + if self.max_retries is not None: + kwargs["max_retries"] = self.max_retries + return AsyncOpenAI( api_key=self.api_key, base_url=self.base_url or None, default_headers=self.default_headers, timeout=self.timeout, http_client=http_client, + **kwargs, ) async def generate_content_async( @@ -633,10 +642,15 @@ def _client(self) -> AsyncAzureOpenAI: http_client = self._create_http_client() + kwargs = {} + if self.max_retries is not None: + kwargs["max_retries"] = self.max_retries + return AsyncAzureOpenAI( api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint, default_headers=self.default_headers, http_client=http_client, + **kwargs, ) diff --git a/python/packages/kagent-adk/src/kagent/adk/types.py b/python/packages/kagent-adk/src/kagent/adk/types.py index 889ed73e0b..4db279c55a 100644 --- a/python/packages/kagent-adk/src/kagent/adk/types.py +++ b/python/packages/kagent-adk/src/kagent/adk/types.py @@ -15,7 +15,6 @@ from kagent.adk._approval import make_approval_callback, strip_confirmation_parts_callback from kagent.adk._mcp_toolset import KAgentMcpToolset -from kagent.adk.models._ssl import create_ssl_context from kagent.adk._remote_a2a_tool import KAgentRemoteA2AToolset from kagent.adk.models._anthropic import KAgentAnthropicLlm from kagent.adk.models._bedrock import KAgentBedrockLlm @@ -23,6 +22,7 @@ from kagent.adk.models._ollama import create_ollama_llm from kagent.adk.models._openai import AzureOpenAI as OpenAIAzure from kagent.adk.models._openai import OpenAI as OpenAINative +from kagent.adk.models._ssl import create_ssl_context from kagent.adk.sandbox_code_executer import SandboxedLocalCodeExecutor from kagent.adk.tools.ask_user_tool import AskUserTool @@ -254,6 +254,10 @@ class BaseLLM(BaseModel): # API key passthrough: forward the Bearer token from incoming requests as the LLM API key api_key_passthrough: bool | None = None + # Max retry attempts for failed LLM HTTP requests (429, 408, transient 5xx), + # retried with exponential backoff by the provider SDK. None = SDK default. + max_retries: int | None = None + class GDCHTokenExchangeConfig(BaseModel): service_account_path: str @@ -361,6 +365,17 @@ class NetworkConfig(BaseModel): allowed_domains: list[str] = Field(default_factory=list) +class ReliabilityConfig(BaseModel): + """Reliability configuration for self-healing and observability behaviors.""" + + # Max consecutive failures for a tool call before the agent stops retrying it. + tool_retries: int | None = None + # Cap on the total number of model calls per request (cost safety rail). + max_llm_calls: int | None = None + # Log every LLM request/response and tool call to the agent pod logs. + debug_logging: bool | None = None + + class AgentConfig(BaseModel): model: ModelUnion = Field(discriminator="type") description: str @@ -373,6 +388,7 @@ class AgentConfig(BaseModel): memory: MemoryConfig | None = None # Memory configuration network: NetworkConfig | None = None context_config: ContextConfig | None = None + reliability: ReliabilityConfig | None = None # Self-healing: retries, call caps, debug logging def to_agent( self, name: str, sts_integration: Optional[ADKTokenPropagationPlugin] = None, propagate_token: bool = False @@ -604,6 +620,18 @@ def _create_llm_from_model_config(model_config: ModelUnion): extra_headers = model_config.headers or {} base_url = getattr(model_config, "base_url", None) + if model_config.max_retries is not None and model_config.type not in ( + "openai", + "azure_openai", + "anthropic", + "gemini", + ): + logger.warning( + "retry.attempts is not supported for model type %s; ignoring (max_retries=%d)", + model_config.type, + model_config.max_retries, + ) + if model_config.type == "openai": from .models._token_source import GDCHTokenSource @@ -640,6 +668,7 @@ def _create_llm_from_model_config(model_config: ModelUnion): temperature=model_config.temperature, timeout=model_config.timeout, top_p=model_config.top_p, + max_retries=model_config.max_retries, token_exchange=token_exchange, **_transport_kwargs(model_config), ) @@ -648,6 +677,7 @@ def _create_llm_from_model_config(model_config: ModelUnion): model=model_config.model, base_url=base_url, extra_headers=extra_headers, + max_retries=model_config.max_retries, **_transport_kwargs(model_config), ) if model_config.type == "gemini_vertex_ai": @@ -668,12 +698,20 @@ def _create_llm_from_model_config(model_config: ModelUnion): model=model_config.model, type="azure_openai", default_headers=extra_headers, + max_retries=model_config.max_retries, **_transport_kwargs(model_config), ) if model_config.type == "gemini": + retry_options = None + if model_config.max_retries is not None: + from google.genai import types as genai_types + + # HttpRetryOptions.attempts counts the initial request too. + retry_options = genai_types.HttpRetryOptions(attempts=model_config.max_retries + 1) return KAgentGeminiLlm( model=model_config.model, extra_headers=extra_headers, + retry_options=retry_options, **_transport_kwargs(model_config), ) if model_config.type == "bedrock": diff --git a/python/packages/kagent-adk/tests/unittests/models/test_sap_ai_core.py b/python/packages/kagent-adk/tests/unittests/models/test_sap_ai_core.py index aace9575a9..fab636b547 100644 --- a/python/packages/kagent-adk/tests/unittests/models/test_sap_ai_core.py +++ b/python/packages/kagent-adk/tests/unittests/models/test_sap_ai_core.py @@ -17,7 +17,6 @@ _parse_orchestration_chunk, ) - # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- diff --git a/python/packages/kagent-adk/tests/unittests/test_model_retry.py b/python/packages/kagent-adk/tests/unittests/test_model_retry.py new file mode 100644 index 0000000000..306e0cd2c5 --- /dev/null +++ b/python/packages/kagent-adk/tests/unittests/test_model_retry.py @@ -0,0 +1,74 @@ +"""Tests for ModelConfig retry (max_retries) plumbing into provider SDK clients.""" + +import json + +from kagent.adk.types import AgentConfig, _create_llm_from_model_config + + +def _make_model_config(**model_extra): + config = { + "model": {"type": "openai", "model": "gpt-4", **model_extra}, + "description": "test agent", + "instruction": "test instruction", + } + return AgentConfig.model_validate_json(json.dumps(config)).model + + +class TestMaxRetriesParsing: + def test_default_unset(self): + model = _make_model_config() + assert model.max_retries is None + + def test_max_retries_parsed(self): + model = _make_model_config(max_retries=5) + assert model.max_retries == 5 + + +class TestMaxRetriesWiring: + def test_openai_client_max_retries(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-key") + llm = _create_llm_from_model_config(_make_model_config(max_retries=5)) + assert llm.max_retries == 5 + assert llm._client.max_retries == 5 + + def test_openai_client_default_when_unset(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-key") + llm = _create_llm_from_model_config(_make_model_config()) + assert llm.max_retries is None + # OpenAI SDK default (2) applies when unset. + assert llm._client.max_retries == 2 + + def test_anthropic_client_max_retries(self, monkeypatch): + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") + config = { + "model": {"type": "anthropic", "model": "claude-sonnet-4-5", "max_retries": 4}, + "description": "d", + "instruction": "i", + } + llm = AgentConfig.model_validate_json(json.dumps(config)).model + llm = _create_llm_from_model_config(llm) + assert llm.max_retries == 4 + assert llm._anthropic_client.max_retries == 4 + + def test_gemini_retry_options(self): + config = { + "model": {"type": "gemini", "model": "gemini-2.0-flash", "max_retries": 3}, + "description": "d", + "instruction": "i", + } + model = AgentConfig.model_validate_json(json.dumps(config)).model + llm = _create_llm_from_model_config(model) + # HttpRetryOptions.attempts counts the initial request too. + assert llm.retry_options is not None + assert llm.retry_options.attempts == 4 + + def test_unsupported_provider_logs_warning(self, caplog): + config = { + "model": {"type": "ollama", "model": "llama3", "max_retries": 3}, + "description": "d", + "instruction": "i", + } + model = AgentConfig.model_validate_json(json.dumps(config)).model + with caplog.at_level("WARNING"): + _create_llm_from_model_config(model) + assert any("retry.attempts is not supported" in r.message for r in caplog.records) diff --git a/python/packages/kagent-adk/tests/unittests/test_reliability_config.py b/python/packages/kagent-adk/tests/unittests/test_reliability_config.py new file mode 100644 index 0000000000..3699fe219a --- /dev/null +++ b/python/packages/kagent-adk/tests/unittests/test_reliability_config.py @@ -0,0 +1,96 @@ +import json +from types import SimpleNamespace + +import pytest + +from kagent.adk._reflect_retry_plugin import KAgentReflectAndRetryToolPlugin +from kagent.adk.types import AgentConfig, ReliabilityConfig + + +def _make_agent_config(**extra) -> AgentConfig: + config = { + "model": {"type": "openai", "model": "gpt-4"}, + "description": "test agent", + "instruction": "test instruction", + } + config.update(extra) + return AgentConfig.model_validate_json(json.dumps(config)) + + +class TestReliabilityConfigParsing: + def test_defaults_unset(self): + config = _make_agent_config() + assert config.reliability is None + + def test_tool_retries(self): + config = _make_agent_config(reliability={"tool_retries": 3}) + assert config.reliability == ReliabilityConfig(tool_retries=3) + + def test_max_llm_calls(self): + config = _make_agent_config(reliability={"tool_retries": 3, "max_llm_calls": 25}) + assert config.reliability == ReliabilityConfig(tool_retries=3, max_llm_calls=25) + + def test_debug_logging(self): + config = _make_agent_config(reliability={"debug_logging": True}) + assert config.reliability is not None + assert config.reliability.debug_logging is True + + +class TestKAgentReflectAndRetryToolPlugin: + @pytest.mark.asyncio + async def test_mcp_error_result_detected(self): + plugin = KAgentReflectAndRetryToolPlugin(max_retries=2, throw_exception_if_retry_exceeded=False) + result = {"content": [{"type": "text", "text": "apply failed: exit status 1"}], "isError": True} + error = await plugin.extract_error_from_result(tool=None, tool_args={}, tool_context=None, result=result) + assert error == result + + @pytest.mark.asyncio + async def test_successful_result_not_detected(self): + plugin = KAgentReflectAndRetryToolPlugin(max_retries=2, throw_exception_if_retry_exceeded=False) + result = {"content": [{"type": "text", "text": "applied"}], "isError": False} + assert await plugin.extract_error_from_result(tool=None, tool_args={}, tool_context=None, result=result) is None + assert await plugin.extract_error_from_result(tool=None, tool_args={}, tool_context=None, result="plain") is None + + @pytest.mark.asyncio + async def test_exception_path_still_handled(self): + """Tools that raise exceptions go through the inherited on_tool_error_callback.""" + plugin = KAgentReflectAndRetryToolPlugin(max_retries=2, throw_exception_if_retry_exceeded=False) + tool = SimpleNamespace(name="kubectl_apply") + ctx = SimpleNamespace(invocation_id="inv-1") + response = await plugin.on_tool_error_callback( + tool=tool, tool_args={"manifest": "bad"}, tool_context=ctx, error=RuntimeError("boom") + ) + assert response is not None + assert "kubectl_apply" in str(response) + + @pytest.mark.asyncio + async def test_exception_and_iserror_share_failure_counter(self): + """An exception followed by an isError result counts as 2 attempts for the same tool.""" + plugin = KAgentReflectAndRetryToolPlugin(max_retries=2, throw_exception_if_retry_exceeded=False) + tool = SimpleNamespace(name="kubectl_apply") + ctx = SimpleNamespace(invocation_id="inv-1") + error_result = {"content": [{"type": "text", "text": "failed"}], "isError": True} + + # Attempt 1: exception + await plugin.on_tool_error_callback(tool=tool, tool_args={}, tool_context=ctx, error=RuntimeError("boom")) + # Attempt 2: isError result (routed via after_tool_callback) + await plugin.after_tool_callback(tool=tool, tool_args={}, tool_context=ctx, result=error_result) + # Attempt 3: exceeds max_retries=2 -> retry-exceeded guidance instead of raising + response = await plugin.after_tool_callback(tool=tool, tool_args={}, tool_context=ctx, result=error_result) + assert response is not None + assert "2" in str(response) # mentions the retry limit + + @pytest.mark.asyncio + async def test_success_resets_counter(self): + plugin = KAgentReflectAndRetryToolPlugin(max_retries=1, throw_exception_if_retry_exceeded=False) + tool = SimpleNamespace(name="kubectl_apply") + ctx = SimpleNamespace(invocation_id="inv-1") + error_result = {"isError": True} + ok_result = {"content": [{"type": "text", "text": "applied"}], "isError": False} + + await plugin.after_tool_callback(tool=tool, tool_args={}, tool_context=ctx, result=error_result) + # Success resets the per-tool counter + assert await plugin.after_tool_callback(tool=tool, tool_args={}, tool_context=ctx, result=ok_result) is None + # Next failure is attempt 1 again -> reflection guidance, not retry-exceeded + response = await plugin.after_tool_callback(tool=tool, tool_args={}, tool_context=ctx, result=error_result) + assert response is not None diff --git a/ui/src/app/actions/agents.ts b/ui/src/app/actions/agents.ts index 2ba29c4101..7a942ba83b 100644 --- a/ui/src/app/actions/agents.ts +++ b/ui/src/app/actions/agents.ts @@ -203,6 +203,14 @@ function fromAgentFormDataToAgent(agentFormData: AgentFormData): Agent { base.spec!.declarative!.context = agentFormData.context; } + if (agentFormData.toolRetries || agentFormData.maxLLMCalls || agentFormData.debugLogging) { + base.spec!.declarative!.reliability = { + toolRetries: agentFormData.toolRetries, + maxLLMCalls: agentFormData.maxLLMCalls, + debugLogging: agentFormData.debugLogging || undefined, + }; + } + const trimmedSA = agentFormData.serviceAccountName?.trim(); if (trimmedSA) { base.spec!.declarative!.deployment = { @@ -373,6 +381,14 @@ function fromAgentFormDataToSandboxAgent(agentFormData: AgentFormData): SandboxA decl.context = agentFormData.context; } + if (agentFormData.toolRetries || agentFormData.maxLLMCalls || agentFormData.debugLogging) { + decl.reliability = { + toolRetries: agentFormData.toolRetries, + maxLLMCalls: agentFormData.maxLLMCalls, + debugLogging: agentFormData.debugLogging || undefined, + }; + } + const trimmedSA = agentFormData.serviceAccountName?.trim(); if (trimmedSA) { decl.deployment = { diff --git a/ui/src/app/agents/new/page.tsx b/ui/src/app/agents/new/page.tsx index 69e567d275..16817e756f 100644 --- a/ui/src/app/agents/new/page.tsx +++ b/ui/src/app/agents/new/page.tsx @@ -1,6 +1,6 @@ "use client"; import React, { useState, useEffect, Suspense, useCallback, useMemo } from "react"; -import { Loader2 } from "lucide-react"; +import { ChevronDown, ChevronRight, Loader2 } from "lucide-react"; import { Input } from "@/components/ui/input"; import { Button } from "@/components/ui/button"; import { Textarea } from "@/components/ui/textarea"; @@ -105,6 +105,12 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo /** Python vs Go ADK (`spec.declarative.runtime`). */ declarativeRuntime: DeclarativeRuntime; contextConfig: ContextConfig | undefined; + /** Max consecutive tool-call failures before the agent stops retrying (empty = disabled). */ + toolRetries: string; + /** Cap on total model calls per request (empty = runtime default). */ + maxLLMCalls: string; + /** Log every LLM request/response and tool call to pod logs. */ + debugLogging: boolean; serviceAccountName: string; promptSourceRows: PromptSourceRow[]; isSubmitting: boolean; @@ -141,6 +147,9 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo stream: false, declarativeRuntime: "python", contextConfig: undefined, + toolRetries: "", + maxLLMCalls: "", + debugLogging: false, serviceAccountName: "", promptSourceRows: [newPromptSourceRow()], isSubmitting: false, @@ -151,6 +160,8 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo substrateSnapshotsLocation: "", }); + const [isAdvancedSectionExpanded, setIsAdvancedSectionExpanded] = useState(false); + const substrateEnabled = useSubstrateEnabled(); // When substrate becomes available, prefer it for sandbox agents still on the default platform. @@ -297,11 +308,17 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo : null, memoryTtlDays: memorySpec?.ttlDays ? String(memorySpec.ttlDays) : "", contextConfig: decl?.context, + toolRetries: decl?.reliability?.toolRetries ? String(decl.reliability.toolRetries) : "", + maxLLMCalls: decl?.reliability?.maxLLMCalls ? String(decl.reliability.maxLLMCalls) : "", + debugLogging: decl?.reliability?.debugLogging ?? false, serviceAccountName: decl?.deployment?.serviceAccountName || "", byoImage: "", byoCmd: "", byoArgs: "", })); + if (decl?.reliability?.toolRetries || decl?.reliability?.maxLLMCalls || decl?.reliability?.debugLogging) { + setIsAdvancedSectionExpanded(true); + } } else { setState((prev) => ({ ...prev, @@ -510,6 +527,11 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo } : undefined, context: useDeclarativeAgentFields ? state.contextConfig : undefined, + toolRetries: + useDeclarativeAgentFields && state.toolRetries ? parseInt(state.toolRetries, 10) : undefined, + maxLLMCalls: + useDeclarativeAgentFields && state.maxLLMCalls ? parseInt(state.maxLLMCalls, 10) : undefined, + debugLogging: useDeclarativeAgentFields ? state.debugLogging : undefined, declarativeRuntime: useDeclarativeAgentFields ? state.declarativeRuntime : undefined, byoImage: state.byoImage, byoCmd: state.byoCmd || undefined, @@ -894,6 +916,90 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo +
+ + {isAdvancedSectionExpanded && ( +
+ + Tool retries + setState((prev) => ({ ...prev, toolRetries: e.target.value }))} + disabled={disabled} + className="max-w-[180px]" + /> + + Self-healing: when a tool call fails, inject reflection guidance and retry up to this + many times before giving up. Leave empty to disable. + + + + + Max model calls per request + setState((prev) => ({ ...prev, maxLLMCalls: e.target.value }))} + disabled={disabled} + className="max-w-[180px]" + /> + + Cost safety rail: stop the run with a clear error if the agent exceeds this many model + calls in a single request (like a resource limit, but for tokens). + + + +
+
+ setState((prev) => ({ ...prev, debugLogging: !!checked }))} + disabled={disabled} + /> +
+
+ +

+ Log every LLM request/response and tool call to the agent pod logs +

+
+
+
+ )} +
+ setState((prev) => ({ ...prev, serviceAccountName: v }))} diff --git a/ui/src/app/models/new/page.tsx b/ui/src/app/models/new/page.tsx index 1adc821ee2..f0d5d6d7f5 100644 --- a/ui/src/app/models/new/page.tsx +++ b/ui/src/app/models/new/page.tsx @@ -1,7 +1,9 @@ "use client"; import React, { useState, useEffect } from "react"; import { Button } from "@/components/ui/button"; -import { Loader2 } from "lucide-react"; +import { Input } from "@/components/ui/input"; +import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; +import { Loader2, ChevronDown, ChevronRight } from "lucide-react"; import { useRouter, useSearchParams } from "next/navigation"; import { LoadingState } from "@/components/LoadingState"; import { ErrorState } from "@/components/ErrorState"; @@ -133,6 +135,8 @@ function ModelPageContent() { const [errors, setErrors] = useState({}); const [isApiKeyNeeded, setIsApiKeyNeeded] = useState(true); const [isParamsSectionExpanded, setIsParamsSectionExpanded] = useState(false); + const [isAdvancedSectionExpanded, setIsAdvancedSectionExpanded] = useState(false); + const [retryAttempts, setRetryAttempts] = useState(""); const [isFetchingModels, setIsFetchingModels] = useState(false); const [existingApiKeySecret, setExistingApiKeySecret] = useState(""); const [existingApiKeySecretKey, setExistingApiKeySecretKey] = useState(""); @@ -245,6 +249,13 @@ function ModelPageContent() { setExistingApiKeySecret(modelData.spec.apiKeySecret || ""); setExistingApiKeySecretKey(modelData.spec.apiKeySecretKey || ""); + if (modelData.spec.retry?.attempts != null) { + setRetryAttempts(String(modelData.spec.retry.attempts)); + setIsAdvancedSectionExpanded(true); + } else { + setRetryAttempts(""); + } + const spec = modelData.spec; const fetchedParams: Record = (spec.openAI ?? spec.anthropic ?? spec.azureOpenAI ?? spec.ollama ?? @@ -583,6 +594,13 @@ function ModelPageContent() { if (existingApiKeySecretKey) spec.apiKeySecretKey = existingApiKeySecretKey; } + if (retryAttempts.trim() !== "") { + const attempts = parseInt(retryAttempts, 10); + if (!isNaN(attempts) && attempts >= 0) { + spec.retry = { attempts }; + } + } + const providerType = finalSelectedProvider.type; switch (providerType) { case 'OpenAI': @@ -766,6 +784,40 @@ function ModelPageContent() { /> )} + + setIsAdvancedSectionExpanded(!isAdvancedSectionExpanded)} + > + Advanced + {isAdvancedSectionExpanded + ? + : } + + {isAdvancedSectionExpanded && ( + +
+ + setRetryAttempts(e.target.value)} + disabled={isSubmitting || isLoading} + /> +

+ Automatically retry failed model requests (rate limits, timeouts, transient server errors) with + exponential backoff. Set to 0 to disable retries. Supported for OpenAI, Azure OpenAI, Anthropic, + and Gemini. +

+
+
+ )} +
+