From 53c68e91d9fe63109226affb609c69816e4eecaa Mon Sep 17 00:00:00 2001 From: Rosa Gutierrez Date: Tue, 20 Feb 2024 10:03:25 +0100 Subject: [PATCH] Include job_id in order clause when fetching scheduled jobs to dispatch and when dispatching This was missing and makes the locking prone to deadlocks when we have a bunch of jobs scheduled at the same time with the same priority, since we'd have a non-deterministic ordering. Besides, on the DELETE query, force a order by job_id even if we don't care about that. It turns out, under certain circumstances, when the scheduled_executions table is too small, MySQL might choose not to use any indexes for the DELETE by job_id. This means it might lock other rows besides those it's going to delete. Then, when running more than one dispatcher doing this, we might end up with a deadlock because one dispatcher is deleting and locking some rows that's not deleting, and the other is doing the same, and both are waiting for the other's lock. This deadlock was happening consistently in CI, for MySQL only, but I didn't manage to reproduce it locally, and it has never happened in production for us. Using an INDEX hint in the DELETE query, to ensure the index on job_id was used, also avoided the deadlock. --- app/models/solid_queue/execution/dispatching.rb | 2 +- app/models/solid_queue/scheduled_execution.rb | 2 +- test/unit/dispatcher_test.rb | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/app/models/solid_queue/execution/dispatching.rb b/app/models/solid_queue/execution/dispatching.rb index 9cfcf25c..4f6bb60e 100644 --- a/app/models/solid_queue/execution/dispatching.rb +++ b/app/models/solid_queue/execution/dispatching.rb @@ -10,7 +10,7 @@ def dispatch_jobs(job_ids) jobs = Job.where(id: job_ids) Job.dispatch_all(jobs).map(&:id).tap do |dispatched_job_ids| - where(job_id: dispatched_job_ids).delete_all + where(job_id: dispatched_job_ids).order(:job_id).delete_all SolidQueue.logger.info("[SolidQueue] Dispatched #{dispatched_job_ids.size} jobs") end end diff --git a/app/models/solid_queue/scheduled_execution.rb b/app/models/solid_queue/scheduled_execution.rb index 5d50b287..f74d68ff 100644 --- a/app/models/solid_queue/scheduled_execution.rb +++ b/app/models/solid_queue/scheduled_execution.rb @@ -5,7 +5,7 @@ class ScheduledExecution < Execution include Dispatching scope :due, -> { where(scheduled_at: ..Time.current) } - scope :ordered, -> { order(scheduled_at: :asc, priority: :asc) } + scope :ordered, -> { order(scheduled_at: :asc, priority: :asc, job_id: :asc) } scope :next_batch, ->(batch_size) { due.ordered.limit(batch_size) } assumes_attributes_from_job :scheduled_at diff --git a/test/unit/dispatcher_test.rb b/test/unit/dispatcher_test.rb index 69b5fa98..52ebfa7a 100644 --- a/test/unit/dispatcher_test.rb +++ b/test/unit/dispatcher_test.rb @@ -4,6 +4,8 @@ class DispatcherTest < ActiveSupport::TestCase include ActiveSupport::Testing::MethodCallAssertions + self.use_transactional_tests = false + setup do @dispatcher = SolidQueue::Dispatcher.new(polling_interval: 0.1, batch_size: 10) end @@ -56,6 +58,7 @@ class DispatcherTest < ActiveSupport::TestCase assert_equal 15, SolidQueue::ScheduledExecution.count another_dispatcher = SolidQueue::Dispatcher.new(polling_interval: 0.1, batch_size: 10) + @dispatcher.start another_dispatcher.start