Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.markdown
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
#### Unreleased
* Add SolidQueue checks: `SolidQueueCheck` (liveness + job stats),
`SolidQueueBackedUpCheck` (per-queue backlog), `SolidQueueFailedJobsCheck`
(total failed jobs), `SolidQueueFailedJobsRateCheck` (rapid increase in
failures within a rolling window), and `SolidQueueScheduledBackedUpCheck`
(scheduled jobs overdue past a grace period)

#### v1.19.2
* Rename organization from `emmahsax` to `okcomputer-ruby`
> emmahsax: https://github.com/okcomputer-ruby/okcomputer/pull/25
Expand Down
16 changes: 16 additions & 0 deletions README.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,22 @@ OkComputer::Registry.register "resque_scheduler_down", OkComputer::ResqueSchedul

# If you're using SolidCache instead of Memcached, use this check instead of CacheCheck
OkComputer::Registry.register "cache", OkComputer::CacheCheckSolidCache.new

# If you're using SolidQueue, these checks monitor its health and throughput.
OkComputer::Registry.register "solid_queue", OkComputer::SolidQueueCheck.new

# Optionally, alert when a specific queue's backlog of ready jobs gets too high:
OkComputer::Registry.register "solid_queue_backed_up", OkComputer::SolidQueueBackedUpCheck.new("default", 100)

# Optionally, alert when scheduled jobs are overdue — a sign the dispatcher has
# stalled and is not promoting jobs to ready.
OkComputer::Registry.register "solid_queue_scheduled_backed_up", OkComputer::SolidQueueScheduledBackedUpCheck.new(0, grace: 2.minutes)

# Optionally, alert when too many jobs have failed in total:
OkComputer::Registry.register "solid_queue_failed_jobs", OkComputer::SolidQueueFailedJobsCheck.new(25)

# Optionally, alert on a rapid increase in failures (more than 10 failures within the last 300 seconds):
OkComputer::Registry.register "solid_queue_failed_jobs_rate", OkComputer::SolidQueueFailedJobsRateCheck.new(10, 300)
```

### Registering Custom Checks
Expand Down
22 changes: 22 additions & 0 deletions lib/ok_computer/built_in_checks/solid_queue_backed_up_check.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
module OkComputer
  # Reports on the backlog of a single SolidQueue queue by comparing its
  # number of ready jobs against a configured threshold.
  class SolidQueueBackedUpCheck < SizeThresholdCheck
    attr_accessor :queue, :threshold

    # Public: Build a backlog check for one SolidQueue queue
    #
    # queue     - The name of the SolidQueue queue to inspect
    # threshold - The number of ready jobs to compare the queue's size
    #             against; coerced with Integer(), so numeric strings are
    #             accepted and non-numeric values raise
    def initialize(queue, threshold)
      self.name = "SolidQueue queue '#{queue}'"
      self.queue = queue
      self.threshold = Integer(threshold)
    end

    # Public: How many ready (pending) jobs the configured queue holds,
    # as reported by SolidQueue::Queue#size
    def size
      solid_queue = SolidQueue::Queue.new(queue)
      solid_queue.size
    end
  end
end
66 changes: 66 additions & 0 deletions lib/ok_computer/built_in_checks/solid_queue_check.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
module OkComputer
  # Verifies that SolidQueue is up and able to process jobs. The check fails
  # when no worker process has a recent heartbeat, or when workers are alive
  # but no dispatcher process is. Every outcome includes a summary of the
  # current job counts.
  #
  # See https://github.com/rails/solid_queue
  class SolidQueueCheck < Check
    # Public: Check whether SolidQueue has live workers and a live dispatcher,
    # and report job stats
    #
    # Each process count is queried once and reused, so the numbers in the
    # message are the same ones the health decision was based on. Dispatchers
    # are only counted when at least one worker is alive.
    #
    # Any StandardError raised while querying marks the check as failed and
    # reports the error message.
    def check
      workers = live_workers

      if workers.zero?
        mark_failure
        mark_message "SolidQueue is DOWN. No workers are alive. (#{stats})"
      elsif (dispatchers = live_dispatchers).zero?
        mark_failure
        mark_message "SolidQueue dispatcher is DOWN. Scheduled jobs will not run. (#{stats})"
      else
        mark_message "SolidQueue is up (#{workers} worker(s), #{dispatchers} dispatcher(s) alive). Job Counts: #{stats}"
      end
    rescue => e
      mark_failure
      mark_message "Error: '#{e}'"
    end

    # Public: The number of worker processes whose last heartbeat is more
    # recent than SolidQueue.process_alive_threshold ago
    def live_workers
      alive_processes.where(kind: "Worker").count
    end

    # Public: The number of dispatcher processes whose last heartbeat is more
    # recent than SolidQueue.process_alive_threshold ago
    def live_dispatchers
      alive_processes.where(kind: "Dispatcher").count
    end

    # Public: A one-line summary of the current job counts across SolidQueue
    # (ready, scheduled, in progress and failed executions)
    def stats
      "ready: #{ready}, scheduled: #{scheduled}, in progress: #{in_progress}, failed: #{failed}"
    end

    private

    # SolidQueue::Process records that have sent a heartbeat recently enough to
    # be considered alive, i.e. after SolidQueue.process_alive_threshold ago.
    def alive_processes
      SolidQueue::Process.where("last_heartbeat_at > ?", SolidQueue.process_alive_threshold.ago)
    end

    # Count of SolidQueue::ReadyExecution records
    def ready
      SolidQueue::ReadyExecution.count
    end

    # Count of SolidQueue::ScheduledExecution records
    def scheduled
      SolidQueue::ScheduledExecution.count
    end

    # Count of SolidQueue::ClaimedExecution records
    def in_progress
      SolidQueue::ClaimedExecution.count
    end

    # Count of SolidQueue::FailedExecution records
    def failed
      SolidQueue::FailedExecution.count
    end
  end
end
19 changes: 19 additions & 0 deletions lib/ok_computer/built_in_checks/solid_queue_failed_jobs_check.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
module OkComputer
  # Compares the total number of failed SolidQueue executions against a
  # configured threshold.
  class SolidQueueFailedJobsCheck < SizeThresholdCheck
    attr_accessor :threshold

    # Public: Build a check on the total number of failed SolidQueue jobs
    #
    # threshold - The failed-job count to compare against; coerced with
    #             Integer(), so numeric strings are accepted
    def initialize(threshold)
      self.name = "SolidQueue Failed Jobs"
      self.threshold = Integer(threshold)
    end

    # Public: The total number of SolidQueue::FailedExecution records
    def size
      failed_executions = SolidQueue::FailedExecution
      failed_executions.count
    end
  end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
module OkComputer
  # Flags a burst of SolidQueue failures. Instead of looking at the total
  # number of accumulated failures, it counts only the failed executions whose
  # created_at falls inside a rolling time window. No state is kept between
  # runs; each run issues a fresh count against the failed executions.
  class SolidQueueFailedJobsRateCheck < SizeThresholdCheck
    attr_accessor :threshold, :window

    # Public: Build a check on the rate of failing SolidQueue jobs
    #
    # threshold - The number of failures within the window to compare
    #             against; coerced with Integer()
    # window    - The length of the rolling window. Either a number of
    #             seconds or an object responding to #ago, such as an
    #             ActiveSupport::Duration (e.g. 5.minutes). Defaults to 300
    #             seconds (5 minutes).
    def initialize(threshold, window = 300)
      self.name = "SolidQueue Failed Jobs Rate"
      self.threshold = Integer(threshold)
      self.window = window
    end

    # Public: The number of failed executions created after the start of the
    # window
    def size
      # Objects responding to #ago (e.g. durations) compute their own cutoff;
      # anything else is treated as seconds and subtracted from the current time.
      window_start =
        if window.respond_to?(:ago)
          window.ago
        else
          Time.now - window
        end

      SolidQueue::FailedExecution.where("created_at > ?", window_start).count
    end
  end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
module OkComputer
  # Detects a stalled SolidQueue dispatcher by counting scheduled jobs that are
  # overdue — i.e. their scheduled_at is at least `grace` in the past, so a
  # healthy dispatcher should already have promoted them to ready_executions.
  #
  # This is distinct from SolidQueueBackedUpCheck, which measures ready (already
  # promoted) depth. A dead/behind dispatcher leaves jobs stuck in scheduled and
  # invisible to that check; this check surfaces them.
  class SolidQueueScheduledBackedUpCheck < SizeThresholdCheck
    attr_accessor :threshold
    attr_accessor :grace

    # Public: Initialize a check for overdue scheduled SolidQueue jobs
    #
    # threshold - An Integer; the number of overdue scheduled jobs to tolerate
    #             before considering the dispatcher backed up. Coerced with
    #             Integer().
    # grace     - How far past scheduled_at a job must be before it counts as
    #             overdue. Accepts either an ActiveSupport::Duration (e.g.
    #             2.minutes) or a plain number of seconds, matching what
    #             SolidQueueFailedJobsRateCheck accepts for its window. Some
    #             lateness between dispatcher polls is normal, so a grace
    #             window prevents flapping. Defaults to 1 minute.
    def initialize(threshold, grace: 1.minute)
      self.threshold = Integer(threshold)
      self.grace = grace
      self.name = "SolidQueue overdue scheduled jobs"
    end

    # Public: Count of scheduled jobs whose scheduled_at is at or before the
    # overdue cutoff. A healthy dispatcher keeps this at 0.
    def size
      SolidQueue::ScheduledExecution.where("scheduled_at <= ?", overdue_cutoff).count
    end

    private

    # The moment before which a still-scheduled job is considered overdue.
    # Durations compute it via #ago; a bare number is treated as seconds so
    # that `grace: 120` works instead of raising NoMethodError.
    def overdue_cutoff
      grace.respond_to?(:ago) ? grace.ago : Time.now - grace
    end
  end
end
5 changes: 5 additions & 0 deletions lib/okcomputer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@
require "ok_computer/built_in_checks/ruby_version_check"
require "ok_computer/built_in_checks/sequel_check"
require "ok_computer/built_in_checks/sidekiq_latency_check"
require "ok_computer/built_in_checks/solid_queue_check"
require "ok_computer/built_in_checks/solid_queue_backed_up_check"
require "ok_computer/built_in_checks/solid_queue_scheduled_backed_up_check"
require "ok_computer/built_in_checks/solid_queue_failed_jobs_check"
require "ok_computer/built_in_checks/solid_queue_failed_jobs_rate_check"
require "ok_computer/built_in_checks/solr_check"

OkComputer::Registry.register "default", OkComputer::DefaultCheck.new
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
require "rails_helper"

# Minimal stand-in for SolidQueue::Queue so this spec can reference the
# constant; apps that load SolidQueue provide the real class.
module SolidQueue
  class Queue
  end
end

module OkComputer
  describe SolidQueueBackedUpCheck do
    let(:queue) { "default" }
    let(:threshold) { 100 }

    subject { described_class.new(queue, threshold) }

    it "is a Check" do
      expect(subject).to be_a Check
    end

    context ".new(queue, threshold)" do
      it "accepts a queue name and a threshold to consider backed up" do
        expect(subject.queue).to eq(queue)
        expect(subject.threshold).to eq(threshold)
      end

      it "coerces the threshold parameter into an integer" do
        check = described_class.new(queue, "123")

        expect(check.threshold).to eq(123)
      end
    end

    context "#check" do
      # Each scenario supplies its own `size`; stubbing #size keeps these
      # examples independent of SolidQueue itself.
      before { allow(subject).to receive(:size).and_return(size) }

      context "with the count less than the threshold" do
        let(:size) { threshold - 1 }

        it { is_expected.to be_successful_check }
        it { is_expected.to have_message "SolidQueue queue '#{queue}' at reasonable level (#{size})" }
      end

      context "with the count equal to the threshold" do
        let(:size) { threshold }

        it { is_expected.to be_successful_check }
        it { is_expected.to have_message "SolidQueue queue '#{queue}' at reasonable level (#{size})" }
      end

      context "with a count greater than the threshold" do
        let(:size) { threshold + 1 }

        it { is_expected.not_to be_successful_check }
        it { is_expected.to have_message "SolidQueue queue '#{queue}' is #{size - threshold} over threshold! (#{size})" }
      end
    end

    context "#size" do
      it "defers to SolidQueue::Queue for the ready job count" do
        # The check should build a SolidQueue::Queue for its queue name and
        # return whatever that object reports as its size.
        stubbed_queue = double("SolidQueue::Queue", size: 42)
        expect(SolidQueue::Queue).to receive(:new).with(queue).and_return(stubbed_queue)

        expect(subject.size).to eq(42)
      end
    end
  end
end
99 changes: 99 additions & 0 deletions spec/ok_computer/built_in_checks/solid_queue_check_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
require "rails_helper"

# Minimal stand-ins for the SolidQueue constants this spec touches; apps that
# load SolidQueue provide the real implementations.
module SolidQueue
  class << self
    # Fixed alive threshold used by the stub
    def process_alive_threshold
      5.minutes
    end
  end

  class Process
  end

  class ReadyExecution
  end

  class ScheduledExecution
  end

  class ClaimedExecution
  end

  class FailedExecution
  end
end

module OkComputer
  describe SolidQueueCheck do
    it "is a Check" do
      expect(subject).to be_a Check
    end

    context "#check" do
      context "when workers and a dispatcher are alive" do
        before do
          allow(subject).to receive_messages(
            live_workers: 2,
            live_dispatchers: 1,
            stats: "ready: 0, scheduled: 0, in progress: 0, failed: 0"
          )
        end

        it { is_expected.to be_successful_check }
        it { is_expected.to have_message "SolidQueue is up (2 worker(s), 1 dispatcher(s) alive)." }
        it { is_expected.to have_message "Job Counts: ready: 0, scheduled: 0, in progress: 0, failed: 0" }
      end

      context "when no workers are alive" do
        # live_dispatchers is left unstubbed here, as in the other scenarios
        # only the methods the branch needs are replaced.
        before do
          allow(subject).to receive_messages(
            live_workers: 0,
            stats: "ready: 5, scheduled: 0, in progress: 0, failed: 0"
          )
        end

        it { is_expected.not_to be_successful_check }
        it { is_expected.to have_message "SolidQueue is DOWN. No workers are alive." }
      end

      context "when workers are alive but the dispatcher is down" do
        before do
          allow(subject).to receive_messages(
            live_workers: 2,
            live_dispatchers: 0,
            stats: "ready: 0, scheduled: 9, in progress: 0, failed: 0"
          )
        end

        it { is_expected.not_to be_successful_check }
        it { is_expected.to have_message "SolidQueue dispatcher is DOWN. Scheduled jobs will not run." }
      end

      context "when an error occurs" do
        before { allow(subject).to receive(:live_workers).and_raise(StandardError, "boom") }

        it { is_expected.not_to be_successful_check }
        it { is_expected.to have_message "Error: 'boom'" }
      end
    end

    context "#live_workers" do
      let(:relation) { double("relation") }

      it "counts worker processes with a recent heartbeat" do
        # First the heartbeat filter, then the kind filter, then the count.
        expect(SolidQueue::Process).to receive(:where).with("last_heartbeat_at > ?", anything).and_return(relation)
        expect(relation).to receive(:where).with(kind: "Worker").and_return(relation)
        expect(relation).to receive(:count).and_return(3)

        expect(subject.live_workers).to eq(3)
      end
    end

    context "#live_dispatchers" do
      let(:relation) { double("relation") }

      it "counts dispatcher processes with a recent heartbeat" do
        expect(SolidQueue::Process).to receive(:where).with("last_heartbeat_at > ?", anything).and_return(relation)
        expect(relation).to receive(:where).with(kind: "Dispatcher").and_return(relation)
        expect(relation).to receive(:count).and_return(1)

        expect(subject.live_dispatchers).to eq(1)
      end
    end

    context "#stats" do
      it "summarizes the current job counts" do
        # One stubbed count per execution table
        {
          SolidQueue::ReadyExecution => 4,
          SolidQueue::ScheduledExecution => 1,
          SolidQueue::ClaimedExecution => 2,
          SolidQueue::FailedExecution => 0
        }.each do |execution_class, total|
          allow(execution_class).to receive(:count).and_return(total)
        end

        expect(subject.stats).to eq("ready: 4, scheduled: 1, in progress: 2, failed: 0")
      end
    end
  end
end
Loading