From 363375194e3796ff2f1899919aaef947340fdfbe Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 3 Jun 2026 01:24:44 +0000 Subject: [PATCH 1/6] Add SolidQueueCheck for monitoring SolidQueue liveness and stats Confirms SolidQueue is up by checking for worker processes with a recent heartbeat (within SolidQueue.process_alive_threshold) and reports a summary of ready, scheduled, in-progress, and failed job counts. https://claude.ai/code/session_01JkQCmoikLnhusMaKb1LepK --- .../built_in_checks/solid_queue_check.rb | 56 ++++++++++++++ lib/okcomputer.rb | 1 + .../built_in_checks/solid_queue_check_spec.rb | 76 +++++++++++++++++++ 3 files changed, 133 insertions(+) create mode 100644 lib/ok_computer/built_in_checks/solid_queue_check.rb create mode 100644 spec/ok_computer/built_in_checks/solid_queue_check_spec.rb diff --git a/lib/ok_computer/built_in_checks/solid_queue_check.rb b/lib/ok_computer/built_in_checks/solid_queue_check.rb new file mode 100644 index 00000000..17996049 --- /dev/null +++ b/lib/ok_computer/built_in_checks/solid_queue_check.rb @@ -0,0 +1,56 @@ +module OkComputer + # Verifies that SolidQueue is up and processing jobs by confirming that at + # least one worker process has a recent heartbeat, and reports a summary of + # the current job counts. + # + # See https://github.com/rails/solid_queue + class SolidQueueCheck < Check + # Public: Check whether SolidQueue has live workers and report job stats + def check + if live_workers.zero? + mark_failure + mark_message "SolidQueue is DOWN. No workers are alive. (#{stats})" + else + mark_message "SolidQueue is up (#{live_workers} worker(s) alive). 
Job Counts: #{stats}" + end + rescue => e + mark_failure + mark_message "Error: '#{e}'" + end + + # Public: The number of worker processes whose heartbeat is within + # SolidQueue's configured alive threshold (default: 5 minutes) + def live_workers + alive_processes.where(kind: "Worker").count + end + + # Public: A summary of the current job counts across SolidQueue + def stats + "ready: #{ready}, scheduled: #{scheduled}, in progress: #{in_progress}, failed: #{failed}" + end + + private + + # SolidQueue::Process records that have sent a heartbeat recently enough to + # be considered alive. Mirrors SolidQueue's own Prunable logic. + def alive_processes + SolidQueue::Process.where("last_heartbeat_at > ?", SolidQueue.process_alive_threshold.ago) + end + + def ready + SolidQueue::ReadyExecution.count + end + + def scheduled + SolidQueue::ScheduledExecution.count + end + + def in_progress + SolidQueue::ClaimedExecution.count + end + + def failed + SolidQueue::FailedExecution.count + end + end +end diff --git a/lib/okcomputer.rb b/lib/okcomputer.rb index dbf959dd..a65a96f0 100644 --- a/lib/okcomputer.rb +++ b/lib/okcomputer.rb @@ -34,6 +34,7 @@ require "ok_computer/built_in_checks/ruby_version_check" require "ok_computer/built_in_checks/sequel_check" require "ok_computer/built_in_checks/sidekiq_latency_check" +require "ok_computer/built_in_checks/solid_queue_check" require "ok_computer/built_in_checks/solr_check" OkComputer::Registry.register "default", OkComputer::DefaultCheck.new diff --git a/spec/ok_computer/built_in_checks/solid_queue_check_spec.rb b/spec/ok_computer/built_in_checks/solid_queue_check_spec.rb new file mode 100644 index 00000000..e9434529 --- /dev/null +++ b/spec/ok_computer/built_in_checks/solid_queue_check_spec.rb @@ -0,0 +1,76 @@ +require "rails_helper" + +# Stubbing the constants out; these will exist in apps which have SolidQueue loaded +module SolidQueue + def self.process_alive_threshold + 5.minutes + end + + class Process; end + class 
ReadyExecution; end + class ScheduledExecution; end + class ClaimedExecution; end + class FailedExecution; end +end + +module OkComputer + describe SolidQueueCheck do + it "is a Check" do + expect(subject).to be_a Check + end + + context "#check" do + context "when workers are alive" do + before do + allow(subject).to receive(:live_workers).and_return(2) + allow(subject).to receive(:stats).and_return("ready: 0, scheduled: 0, in progress: 0, failed: 0") + end + + it { is_expected.to be_successful_check } + it { is_expected.to have_message "SolidQueue is up (2 worker(s) alive)." } + it { is_expected.to have_message "Job Counts: ready: 0, scheduled: 0, in progress: 0, failed: 0" } + end + + context "when no workers are alive" do + before do + allow(subject).to receive(:live_workers).and_return(0) + allow(subject).to receive(:stats).and_return("ready: 5, scheduled: 0, in progress: 0, failed: 0") + end + + it { is_expected.not_to be_successful_check } + it { is_expected.to have_message "SolidQueue is DOWN. No workers are alive." 
} + end + + context "when an error occurs" do + before do + allow(subject).to receive(:live_workers).and_raise(StandardError, "boom") + end + + it { is_expected.not_to be_successful_check } + it { is_expected.to have_message "Error: 'boom'" } + end + end + + context "#live_workers" do + it "counts worker processes with a recent heartbeat" do + relation = double("relation") + expect(SolidQueue::Process).to receive(:where).with("last_heartbeat_at > ?", anything).and_return(relation) + expect(relation).to receive(:where).with(kind: "Worker").and_return(relation) + expect(relation).to receive(:count).and_return(3) + + expect(subject.live_workers).to eq(3) + end + end + + context "#stats" do + it "summarizes the current job counts" do + allow(SolidQueue::ReadyExecution).to receive(:count).and_return(4) + allow(SolidQueue::ScheduledExecution).to receive(:count).and_return(1) + allow(SolidQueue::ClaimedExecution).to receive(:count).and_return(2) + allow(SolidQueue::FailedExecution).to receive(:count).and_return(0) + + expect(subject.stats).to eq("ready: 4, scheduled: 1, in progress: 2, failed: 0") + end + end + end +end From 856ea59e65daeb3bad2c3f2011926d5d4a7fe3cc Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 3 Jun 2026 01:24:44 +0000 Subject: [PATCH 2/6] Add SolidQueueBackedUpCheck for per-queue backlog monitoring Fails when the number of ready (pending) jobs in a given SolidQueue queue exceeds a configurable threshold. Subclasses SizeThresholdCheck. 
https://claude.ai/code/session_01JkQCmoikLnhusMaKb1LepK --- .../solid_queue_backed_up_check.rb | 22 ++++++ lib/okcomputer.rb | 1 + .../solid_queue_backed_up_check_spec.rb | 67 +++++++++++++++++++ 3 files changed, 90 insertions(+) create mode 100644 lib/ok_computer/built_in_checks/solid_queue_backed_up_check.rb create mode 100644 spec/ok_computer/built_in_checks/solid_queue_backed_up_check_spec.rb diff --git a/lib/ok_computer/built_in_checks/solid_queue_backed_up_check.rb b/lib/ok_computer/built_in_checks/solid_queue_backed_up_check.rb new file mode 100644 index 00000000..a674561a --- /dev/null +++ b/lib/ok_computer/built_in_checks/solid_queue_backed_up_check.rb @@ -0,0 +1,22 @@ +module OkComputer + class SolidQueueBackedUpCheck < SizeThresholdCheck + attr_accessor :queue + attr_accessor :threshold + + # Public: Initialize a check for a backed-up SolidQueue queue + # + # queue - The name of the SolidQueue queue to check + # threshold - An Integer to compare the queue's number of ready jobs + # against to consider it backed up + def initialize(queue, threshold) + self.queue = queue + self.threshold = Integer(threshold) + self.name = "SolidQueue queue '#{queue}'" + end + + # Public: The number of ready (pending) jobs in the check's queue + def size + SolidQueue::Queue.new(queue).size + end + end +end diff --git a/lib/okcomputer.rb b/lib/okcomputer.rb index a65a96f0..5d1fdb25 100644 --- a/lib/okcomputer.rb +++ b/lib/okcomputer.rb @@ -35,6 +35,7 @@ require "ok_computer/built_in_checks/sequel_check" require "ok_computer/built_in_checks/sidekiq_latency_check" require "ok_computer/built_in_checks/solid_queue_check" +require "ok_computer/built_in_checks/solid_queue_backed_up_check" require "ok_computer/built_in_checks/solr_check" OkComputer::Registry.register "default", OkComputer::DefaultCheck.new diff --git a/spec/ok_computer/built_in_checks/solid_queue_backed_up_check_spec.rb b/spec/ok_computer/built_in_checks/solid_queue_backed_up_check_spec.rb new file mode 100644 
index 00000000..ddfce514 --- /dev/null +++ b/spec/ok_computer/built_in_checks/solid_queue_backed_up_check_spec.rb @@ -0,0 +1,67 @@ +require "rails_helper" + +# Stubbing the constant out; will exist in apps which have SolidQueue loaded +module SolidQueue + class Queue; end +end + +module OkComputer + describe SolidQueueBackedUpCheck do + let(:queue) { "default" } + let(:threshold) { 100 } + + subject { SolidQueueBackedUpCheck.new(queue, threshold) } + + it "is a Check" do + expect(subject).to be_a Check + end + + context ".new(queue, threshold)" do + it "accepts a queue name and a threshold to consider backed up" do + expect(subject.queue).to eq(queue) + expect(subject.threshold).to eq(threshold) + end + + it "coerces the threshold parameter into an integer" do + expect(SolidQueueBackedUpCheck.new(queue, "123").threshold).to eq(123) + end + end + + context "#check" do + context "with the count less than the threshold" do + before do + allow(subject).to receive(:size) { threshold - 1 } + end + + it { is_expected.to be_successful_check } + it { is_expected.to have_message "SolidQueue queue '#{queue}' at reasonable level (#{subject.size})" } + end + + context "with the count equal to the threshold" do + before do + allow(subject).to receive(:size) { threshold } + end + + it { is_expected.to be_successful_check } + it { is_expected.to have_message "SolidQueue queue '#{queue}' at reasonable level (#{subject.size})" } + end + + context "with a count greater than the threshold" do + before do + allow(subject).to receive(:size) { threshold + 1 } + end + + it { is_expected.not_to be_successful_check } + it { is_expected.to have_message "SolidQueue queue '#{queue}' is #{subject.size - subject.threshold} over threshold! 
(#{subject.size})" } + end + end + + context "#size" do + it "defers to SolidQueue::Queue for the ready job count" do + solid_queue = double("SolidQueue::Queue", size: 42) + expect(SolidQueue::Queue).to receive(:new).with(queue).and_return(solid_queue) + expect(subject.size).to eq(42) + end + end + end +end From 1439656be9beec645f67c7fde9d9e5cd4be49c72 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 3 Jun 2026 01:24:44 +0000 Subject: [PATCH 3/6] Add SolidQueueFailedJobsCheck for total failed job count Fails when the cumulative number of SolidQueue failed executions exceeds a configurable threshold. Subclasses SizeThresholdCheck. https://claude.ai/code/session_01JkQCmoikLnhusMaKb1LepK --- .../solid_queue_failed_jobs_check.rb | 19 ++++++ lib/okcomputer.rb | 1 + .../solid_queue_failed_jobs_check_spec.rb | 64 +++++++++++++++++++ 3 files changed, 84 insertions(+) create mode 100644 lib/ok_computer/built_in_checks/solid_queue_failed_jobs_check.rb create mode 100644 spec/ok_computer/built_in_checks/solid_queue_failed_jobs_check_spec.rb diff --git a/lib/ok_computer/built_in_checks/solid_queue_failed_jobs_check.rb b/lib/ok_computer/built_in_checks/solid_queue_failed_jobs_check.rb new file mode 100644 index 00000000..aa14a4f9 --- /dev/null +++ b/lib/ok_computer/built_in_checks/solid_queue_failed_jobs_check.rb @@ -0,0 +1,19 @@ +module OkComputer + class SolidQueueFailedJobsCheck < SizeThresholdCheck + attr_accessor :threshold + + # Public: Initialize a check for the total number of failed SolidQueue jobs + # + # threshold - An Integer to compare the failed job count against to + # consider it over threshold + def initialize(threshold) + self.threshold = Integer(threshold) + self.name = "SolidQueue Failed Jobs" + end + + # Public: The total number of failed jobs + def size + SolidQueue::FailedExecution.count + end + end +end diff --git a/lib/okcomputer.rb b/lib/okcomputer.rb index 5d1fdb25..16b500b3 100644 --- a/lib/okcomputer.rb +++ b/lib/okcomputer.rb @@ -36,6 +36,7 @@ 
require "ok_computer/built_in_checks/sidekiq_latency_check" require "ok_computer/built_in_checks/solid_queue_check" require "ok_computer/built_in_checks/solid_queue_backed_up_check" +require "ok_computer/built_in_checks/solid_queue_failed_jobs_check" require "ok_computer/built_in_checks/solr_check" OkComputer::Registry.register "default", OkComputer::DefaultCheck.new diff --git a/spec/ok_computer/built_in_checks/solid_queue_failed_jobs_check_spec.rb b/spec/ok_computer/built_in_checks/solid_queue_failed_jobs_check_spec.rb new file mode 100644 index 00000000..73111fe1 --- /dev/null +++ b/spec/ok_computer/built_in_checks/solid_queue_failed_jobs_check_spec.rb @@ -0,0 +1,64 @@ +require "rails_helper" + +# Stubbing the constant out; will exist in apps which have SolidQueue loaded +module SolidQueue + class FailedExecution; end +end + +module OkComputer + describe SolidQueueFailedJobsCheck do + let(:threshold) { 100 } + + subject { SolidQueueFailedJobsCheck.new(threshold) } + + it "is a Check" do + expect(subject).to be_a Check + end + + context ".new(threshold)" do + it "accepts a threshold to consider over the limit" do + expect(subject.threshold).to eq(threshold) + end + + it "coerces the threshold parameter into an integer" do + expect(SolidQueueFailedJobsCheck.new("123").threshold).to eq(123) + end + end + + context "#check" do + context "with the count less than the threshold" do + before do + allow(subject).to receive(:size) { threshold - 1 } + end + + it { is_expected.to be_successful_check } + it { is_expected.to have_message "SolidQueue Failed Jobs at reasonable level (#{subject.size})" } + end + + context "with the count equal to the threshold" do + before do + allow(subject).to receive(:size) { threshold } + end + + it { is_expected.to be_successful_check } + it { is_expected.to have_message "SolidQueue Failed Jobs at reasonable level (#{subject.size})" } + end + + context "with a count greater than the threshold" do + before do + allow(subject).to 
receive(:size) { threshold + 1 } + end + + it { is_expected.not_to be_successful_check } + it { is_expected.to have_message "SolidQueue Failed Jobs is #{subject.size - subject.threshold} over threshold! (#{subject.size})" } + end + end + + context "#size" do + it "defers to SolidQueue for the failed job count" do + expect(SolidQueue::FailedExecution).to receive(:count) { 123 } + expect(subject.size).to eq(123) + end + end + end +end From de35386d42c852b37b0ad3d0351833e2c1ef5869 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 3 Jun 2026 01:24:45 +0000 Subject: [PATCH 4/6] Add SolidQueueFailedJobsRateCheck to detect rapid failure increases Counts failed executions created within a rolling time window (default 5 minutes) and fails when more than the threshold have failed recently. This is stateless across requests, relying on each failed execution's created_at rather than a stored snapshot. Subclasses SizeThresholdCheck. https://claude.ai/code/session_01JkQCmoikLnhusMaKb1LepK --- .../solid_queue_failed_jobs_rate_check.rb | 29 +++++++ lib/okcomputer.rb | 1 + ...solid_queue_failed_jobs_rate_check_spec.rb | 77 +++++++++++++++++++ 3 files changed, 107 insertions(+) create mode 100644 lib/ok_computer/built_in_checks/solid_queue_failed_jobs_rate_check.rb create mode 100644 spec/ok_computer/built_in_checks/solid_queue_failed_jobs_rate_check_spec.rb diff --git a/lib/ok_computer/built_in_checks/solid_queue_failed_jobs_rate_check.rb b/lib/ok_computer/built_in_checks/solid_queue_failed_jobs_rate_check.rb new file mode 100644 index 00000000..f840058e --- /dev/null +++ b/lib/ok_computer/built_in_checks/solid_queue_failed_jobs_rate_check.rb @@ -0,0 +1,29 @@ +module OkComputer + # Detects rapid increases in failed SolidQueue jobs by counting failures + # that occurred within a rolling time window, rather than the total + # accumulated failures. This is stateless across requests: it relies on the + # created_at timestamp of each failed execution. 
+ class SolidQueueFailedJobsRateCheck < SizeThresholdCheck + attr_accessor :threshold + attr_accessor :window + + # Public: Initialize a check for the rate of failing SolidQueue jobs + # + # threshold - An Integer number of failures within the window to tolerate + # before the check is considered failed + # window - The size of the rolling window to count failures within. Accepts + # either a number of seconds or an ActiveSupport::Duration (e.g. + # 5.minutes). Defaults to 300 seconds (5 minutes). + def initialize(threshold, window = 300) + self.threshold = Integer(threshold) + self.window = window + self.name = "SolidQueue Failed Jobs Rate" + end + + # Public: The number of jobs that have failed within the window + def size + cutoff = window.respond_to?(:ago) ? window.ago : Time.now - window + SolidQueue::FailedExecution.where("created_at > ?", cutoff).count + end + end +end diff --git a/lib/okcomputer.rb b/lib/okcomputer.rb index 16b500b3..888ad813 100644 --- a/lib/okcomputer.rb +++ b/lib/okcomputer.rb @@ -37,6 +37,7 @@ require "ok_computer/built_in_checks/solid_queue_check" require "ok_computer/built_in_checks/solid_queue_backed_up_check" require "ok_computer/built_in_checks/solid_queue_failed_jobs_check" +require "ok_computer/built_in_checks/solid_queue_failed_jobs_rate_check" require "ok_computer/built_in_checks/solr_check" OkComputer::Registry.register "default", OkComputer::DefaultCheck.new diff --git a/spec/ok_computer/built_in_checks/solid_queue_failed_jobs_rate_check_spec.rb b/spec/ok_computer/built_in_checks/solid_queue_failed_jobs_rate_check_spec.rb new file mode 100644 index 00000000..e8324b43 --- /dev/null +++ b/spec/ok_computer/built_in_checks/solid_queue_failed_jobs_rate_check_spec.rb @@ -0,0 +1,77 @@ +require "rails_helper" + +# Stubbing the constant out; will exist in apps which have SolidQueue loaded +module SolidQueue + class FailedExecution; end +end + +module OkComputer + describe SolidQueueFailedJobsRateCheck do + let(:threshold) { 10 } + + 
subject { SolidQueueFailedJobsRateCheck.new(threshold) } + + it "is a Check" do + expect(subject).to be_a Check + end + + context ".new(threshold, window)" do + it "accepts a threshold and defaults the window to 300 seconds" do + expect(subject.threshold).to eq(threshold) + expect(subject.window).to eq(300) + end + + it "accepts a custom window" do + expect(SolidQueueFailedJobsRateCheck.new(threshold, 60).window).to eq(60) + end + + it "coerces the threshold parameter into an integer" do + expect(SolidQueueFailedJobsRateCheck.new("123").threshold).to eq(123) + end + end + + context "#check" do + context "with the count less than the threshold" do + before do + allow(subject).to receive(:size) { threshold - 1 } + end + + it { is_expected.to be_successful_check } + it { is_expected.to have_message "SolidQueue Failed Jobs Rate at reasonable level (#{subject.size})" } + end + + context "with a count greater than the threshold" do + before do + allow(subject).to receive(:size) { threshold + 1 } + end + + it { is_expected.not_to be_successful_check } + it { is_expected.to have_message "SolidQueue Failed Jobs Rate is #{subject.size - subject.threshold} over threshold! 
(#{subject.size})" } + end + end + + context "#size" do + it "counts failed executions created within the window" do + relation = double("relation", count: 7) + expect(SolidQueue::FailedExecution).to receive(:where).with("created_at > ?", anything).and_return(relation) + expect(subject.size).to eq(7) + end + + it "uses a number of seconds to compute the cutoff" do + now = Time.now + allow(Time).to receive(:now).and_return(now) + relation = double("relation", count: 0) + check = SolidQueueFailedJobsRateCheck.new(threshold, 120) + expect(SolidQueue::FailedExecution).to receive(:where).with("created_at > ?", now - 120).and_return(relation) + check.size + end + + it "accepts an ActiveSupport::Duration window" do + relation = double("relation", count: 0) + check = SolidQueueFailedJobsRateCheck.new(threshold, 5.minutes) + expect(SolidQueue::FailedExecution).to receive(:where).with("created_at > ?", anything).and_return(relation) + check.size + end + end + end +end From 56a66bf87246b56a9d9f44fd304eabf18bb4294b Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 4 Jun 2026 18:16:41 +0000 Subject: [PATCH 5/6] Add SolidQueueScheduledBackedUpCheck to detect a stalled dispatcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Counts scheduled jobs whose scheduled_at is more than a grace period in the past — jobs a healthy dispatcher should already have promoted to ready. This surfaces a dead, wedged, or merely slow dispatcher, which SolidQueueBackedUpCheck (ready depth only) cannot see. Subclasses SizeThresholdCheck. 
https://claude.ai/code/session_01JkQCmoikLnhusMaKb1LepK --- .../built_in_checks/solid_queue_check.rb | 14 ++++- .../solid_queue_scheduled_backed_up_check.rb | 33 +++++++++++ lib/okcomputer.rb | 1 + .../built_in_checks/solid_queue_check_spec.rb | 27 ++++++++- ...id_queue_scheduled_backed_up_check_spec.rb | 58 +++++++++++++++++++ 5 files changed, 129 insertions(+), 4 deletions(-) create mode 100644 lib/ok_computer/built_in_checks/solid_queue_scheduled_backed_up_check.rb create mode 100644 spec/ok_computer/built_in_checks/solid_queue_scheduled_backed_up_check_spec.rb diff --git a/lib/ok_computer/built_in_checks/solid_queue_check.rb b/lib/ok_computer/built_in_checks/solid_queue_check.rb index 17996049..62418237 100644 --- a/lib/ok_computer/built_in_checks/solid_queue_check.rb +++ b/lib/ok_computer/built_in_checks/solid_queue_check.rb @@ -5,13 +5,17 @@ module OkComputer # # See https://github.com/rails/solid_queue class SolidQueueCheck < Check - # Public: Check whether SolidQueue has live workers and report job stats + # Public: Check whether SolidQueue has live workers and a live dispatcher, + # and report job stats def check if live_workers.zero? mark_failure mark_message "SolidQueue is DOWN. No workers are alive. (#{stats})" + elsif live_dispatchers.zero? + mark_failure + mark_message "SolidQueue dispatcher is DOWN. Scheduled jobs will not run. (#{stats})" else - mark_message "SolidQueue is up (#{live_workers} worker(s) alive). Job Counts: #{stats}" + mark_message "SolidQueue is up (#{live_workers} worker(s), #{live_dispatchers} dispatcher(s) alive). 
Job Counts: #{stats}" end rescue => e mark_failure @@ -24,6 +28,12 @@ def live_workers alive_processes.where(kind: "Worker").count end + # Public: The number of dispatcher processes whose heartbeat is recent enough + # to be considered alive + def live_dispatchers + alive_processes.where(kind: "Dispatcher").count + end + # Public: A summary of the current job counts across SolidQueue def stats "ready: #{ready}, scheduled: #{scheduled}, in progress: #{in_progress}, failed: #{failed}" diff --git a/lib/ok_computer/built_in_checks/solid_queue_scheduled_backed_up_check.rb b/lib/ok_computer/built_in_checks/solid_queue_scheduled_backed_up_check.rb new file mode 100644 index 00000000..6be0f50f --- /dev/null +++ b/lib/ok_computer/built_in_checks/solid_queue_scheduled_backed_up_check.rb @@ -0,0 +1,33 @@ +module OkComputer + # Detects a stalled SolidQueue dispatcher by counting scheduled jobs that are + # overdue — i.e. their scheduled_at is more than `grace` in the past, so a + # healthy dispatcher should already have promoted them to ready_executions. + # + # This is distinct from SolidQueueBackedUpCheck, which measures ready (already + # promoted) depth. A dead/behind dispatcher leaves jobs stuck in scheduled and + # invisible to that check; this check surfaces them. + class SolidQueueScheduledBackedUpCheck < SizeThresholdCheck + attr_accessor :threshold + attr_accessor :grace + + # Public: Initialize a check for overdue scheduled SolidQueue jobs + # + # threshold - An Integer; the number of overdue scheduled jobs to tolerate + # before considering the dispatcher backed up. + # grace - An ActiveSupport::Duration; how far past scheduled_at a job must be + # before it counts as overdue. The dispatcher polls roughly every second + # (config/queue.yml polling_interval), so sub-poll lateness is normal and a + # grace window prevents flapping. Defaults to 1 minute. 
+ def initialize(threshold, grace: 1.minute) + self.threshold = Integer(threshold) + self.grace = grace + self.name = "SolidQueue overdue scheduled jobs" + end + + # Public: Count of scheduled jobs overdue by more than `grace`. A healthy + # dispatcher keeps this at 0. + def size + SolidQueue::ScheduledExecution.where("scheduled_at <= ?", grace.ago).count + end + end +end diff --git a/lib/okcomputer.rb b/lib/okcomputer.rb index 888ad813..de57f618 100644 --- a/lib/okcomputer.rb +++ b/lib/okcomputer.rb @@ -36,6 +36,7 @@ require "ok_computer/built_in_checks/sidekiq_latency_check" require "ok_computer/built_in_checks/solid_queue_check" require "ok_computer/built_in_checks/solid_queue_backed_up_check" +require "ok_computer/built_in_checks/solid_queue_scheduled_backed_up_check" require "ok_computer/built_in_checks/solid_queue_failed_jobs_check" require "ok_computer/built_in_checks/solid_queue_failed_jobs_rate_check" require "ok_computer/built_in_checks/solr_check" diff --git a/spec/ok_computer/built_in_checks/solid_queue_check_spec.rb b/spec/ok_computer/built_in_checks/solid_queue_check_spec.rb index e9434529..8eda4bbd 100644 --- a/spec/ok_computer/built_in_checks/solid_queue_check_spec.rb +++ b/spec/ok_computer/built_in_checks/solid_queue_check_spec.rb @@ -20,14 +20,15 @@ module OkComputer end context "#check" do - context "when workers are alive" do + context "when workers and a dispatcher are alive" do before do allow(subject).to receive(:live_workers).and_return(2) + allow(subject).to receive(:live_dispatchers).and_return(1) allow(subject).to receive(:stats).and_return("ready: 0, scheduled: 0, in progress: 0, failed: 0") end it { is_expected.to be_successful_check } - it { is_expected.to have_message "SolidQueue is up (2 worker(s) alive)." } + it { is_expected.to have_message "SolidQueue is up (2 worker(s), 1 dispatcher(s) alive)." 
} it { is_expected.to have_message "Job Counts: ready: 0, scheduled: 0, in progress: 0, failed: 0" } end @@ -41,6 +42,17 @@ module OkComputer it { is_expected.to have_message "SolidQueue is DOWN. No workers are alive." } end + context "when workers are alive but the dispatcher is down" do + before do + allow(subject).to receive(:live_workers).and_return(2) + allow(subject).to receive(:live_dispatchers).and_return(0) + allow(subject).to receive(:stats).and_return("ready: 0, scheduled: 9, in progress: 0, failed: 0") + end + + it { is_expected.not_to be_successful_check } + it { is_expected.to have_message "SolidQueue dispatcher is DOWN. Scheduled jobs will not run." } + end + context "when an error occurs" do before do allow(subject).to receive(:live_workers).and_raise(StandardError, "boom") @@ -62,6 +74,17 @@ module OkComputer end end + context "#live_dispatchers" do + it "counts dispatcher processes with a recent heartbeat" do + relation = double("relation") + expect(SolidQueue::Process).to receive(:where).with("last_heartbeat_at > ?", anything).and_return(relation) + expect(relation).to receive(:where).with(kind: "Dispatcher").and_return(relation) + expect(relation).to receive(:count).and_return(1) + + expect(subject.live_dispatchers).to eq(1) + end + end + context "#stats" do it "summarizes the current job counts" do allow(SolidQueue::ReadyExecution).to receive(:count).and_return(4) diff --git a/spec/ok_computer/built_in_checks/solid_queue_scheduled_backed_up_check_spec.rb b/spec/ok_computer/built_in_checks/solid_queue_scheduled_backed_up_check_spec.rb new file mode 100644 index 00000000..59d60f97 --- /dev/null +++ b/spec/ok_computer/built_in_checks/solid_queue_scheduled_backed_up_check_spec.rb @@ -0,0 +1,58 @@ +require "rails_helper" + +# Stubbing the constant out; will exist in apps which have SolidQueue loaded +module SolidQueue + class ScheduledExecution; end +end + +module OkComputer + describe SolidQueueScheduledBackedUpCheck do + let(:threshold) { 0 } + + 
subject { SolidQueueScheduledBackedUpCheck.new(threshold) } + + it "is a Check" do + expect(subject).to be_a Check + end + + context ".new(threshold, grace:)" do + it "accepts a threshold and defaults the grace period to 1 minute" do + expect(subject.threshold).to eq(threshold) + expect(subject.grace).to eq(1.minute) + end + + it "coerces the threshold parameter into an integer" do + expect(SolidQueueScheduledBackedUpCheck.new("5").threshold).to eq(5) + end + + it "accepts a custom grace period" do + expect(SolidQueueScheduledBackedUpCheck.new(0, grace: 10.minutes).grace).to eq(10.minutes) + end + end + + context "#check" do + context "with the count less than or equal to the threshold" do + before { allow(subject).to receive(:size) { threshold } } + + it { is_expected.to be_successful_check } + it { is_expected.to have_message "SolidQueue overdue scheduled jobs at reasonable level (#{subject.size})" } + end + + context "with a count greater than the threshold" do + before { allow(subject).to receive(:size) { threshold + 3 } } + + it { is_expected.not_to be_successful_check } + it { is_expected.to have_message "SolidQueue overdue scheduled jobs is #{subject.size - subject.threshold} over threshold! 
(#{subject.size})" } + end + end + + context "#size" do + it "counts scheduled jobs overdue by more than the grace period" do + relation = double("relation", count: 7) + expect(SolidQueue::ScheduledExecution).to receive(:where).with("scheduled_at <= ?", anything).and_return(relation) + + expect(subject.size).to eq(7) + end + end + end +end From 166529866c03b12b4f6ae7a6224a36f0519f2110 Mon Sep 17 00:00:00 2001 From: Steven Chanin Date: Thu, 4 Jun 2026 11:31:19 -0700 Subject: [PATCH 6/6] Document changes --- CHANGELOG.markdown | 7 +++++++ README.markdown | 16 ++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/CHANGELOG.markdown b/CHANGELOG.markdown index fe783312..2fa13222 100644 --- a/CHANGELOG.markdown +++ b/CHANGELOG.markdown @@ -1,3 +1,10 @@ +#### Unreleased +* Add SolidQueue checks: `SolidQueueCheck` (liveness + job stats), + `SolidQueueBackedUpCheck` (per-queue backlog), `SolidQueueFailedJobsCheck` + (total failed jobs), `SolidQueueFailedJobsRateCheck` (rapid increase in + failures within a rolling window), and `SolidQueueScheduledBackedUpCheck` + (scheduled jobs overdue past a grace period) + #### v1.19.2 * Rename organization from `emmahsax` to `okcomputer-ruby` > emmahsax: https://github.com/okcomputer-ruby/okcomputer/pull/25 diff --git a/README.markdown b/README.markdown index a2e5529e..6ea05b1b 100644 --- a/README.markdown +++ b/README.markdown @@ -141,6 +141,22 @@ OkComputer::Registry.register "resque_scheduler_down", OkComputer::ResqueSchedul # If you're using SolidCache instead of Memcached, use this check instead of CacheCheck OkComputer::Registry.register "cache", OkComputer::CacheCheckSolidCache.new + +# If you're using SolidQueue, these checks monitor its health and throughput. 
+OkComputer::Registry.register "solid_queue", OkComputer::SolidQueueCheck.new + +# Optionally, alert when a specific queue's backlog of ready jobs gets too high: +OkComputer::Registry.register "solid_queue_backed_up", OkComputer::SolidQueueBackedUpCheck.new("default", 100) + +# Optionally, alert when scheduled jobs are overdue — a sign the dispatcher has +# stalled and is not promoting jobs to ready. +OkComputer::Registry.register "solid_queue_scheduled_backed_up", OkComputer::SolidQueueScheduledBackedUpCheck.new(0, grace: 2.minutes) + +# Optionally, alert when too many jobs have failed in total: +OkComputer::Registry.register "solid_queue_failed_jobs", OkComputer::SolidQueueFailedJobsCheck.new(25) + +# Optionally, alert on a rapid increase in failures (more than 10 failures within the last 300 seconds): +OkComputer::Registry.register "solid_queue_failed_jobs_rate", OkComputer::SolidQueueFailedJobsRateCheck.new(10, 300) ``` ### Registering Custom Checks