From 63b7c09bb5fcd5eb60c26ce3c53cccf174bf445b Mon Sep 17 00:00:00 2001 From: Matt Amos Date: Tue, 7 Jul 2020 19:42:31 +0100 Subject: [PATCH 1/1] Attempt to fix changeset replication lock queueing. When the changeset replication runs slow, a lot of processes queue up trying to get the lock. I'm not totally sure, but it looks like they then get woken up in a random order, resulting in weird out-of-order behaviour. This patch simplifies that process in two ways: 1. If the lock isn't acquired, the process exits. This means much less (perhaps zero) lock queueing. 2. Using a separate `lockfile`, rather than the current state file. Since the new state file is moved into place over the old one, it was effectively unlocking at that point for _new_ processes. But would also unblock an old process which still had the old (now unlinked) file descriptor open. Hopefully between these two changes, it resolves some of the brokenness that plagues changeset replication! :-) --- .../files/default/replication-bin/replicate-changesets | 9 +++++++-- cookbooks/planet/recipes/replication.rb | 6 ++++++ cookbooks/planet/templates/default/changesets.conf.erb | 1 + 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/cookbooks/planet/files/default/replication-bin/replicate-changesets b/cookbooks/planet/files/default/replication-bin/replicate-changesets index 29131b2ed..77dc83312 100755 --- a/cookbooks/planet/files/default/replication-bin/replicate-changesets +++ b/cookbooks/planet/files/default/replication-bin/replicate-changesets @@ -261,8 +261,13 @@ class Replicator # saves new state (including the changeset dump xml) def save! - File.open(@config["state_file"], "r") do |fl| - fl.flock(File::LOCK_EX) + File.open(@config["lock_file"], File::RDWR | File::CREAT, 0o600) do |fl| + # take the lock in non-blocking mode. if this process doesn't get the lock + # then another will be run from cron shortly. this prevents a whole bunch + # of processes queueing on the lock and causing weirdness if/when they + # get woken up in a random order. + got_lock = fl.flock(File::LOCK_EX | File::LOCK_NB) + return unless got_lock # try and write the files to tmp locations and then # move them into place later, to avoid in-progress diff --git a/cookbooks/planet/recipes/replication.rb b/cookbooks/planet/recipes/replication.rb index b23c00f36..f1be1d0d0 100644 --- a/cookbooks/planet/recipes/replication.rb +++ b/cookbooks/planet/recipes/replication.rb @@ -127,6 +127,12 @@ directory "/etc/replication" do mode 0o755 end +directory "/var/run/lock/changeset-replication/" do + owner "planet" + group "planet" + mode 0o750 +end + template "/etc/replication/auth.conf" do source "replication.auth.erb" user "root" diff --git a/cookbooks/planet/templates/default/changesets.conf.erb b/cookbooks/planet/templates/default/changesets.conf.erb index 309f253b4..73fc83e17 100644 --- a/cookbooks/planet/templates/default/changesets.conf.erb +++ b/cookbooks/planet/templates/default/changesets.conf.erb @@ -1,3 +1,4 @@ state_file: /store/planet/replication/changesets/state.yaml db: host=<%= node[:web][:database_host] %> dbname=openstreetmap user=planetdiff password=<%= @password %> data_dir: /store/planet/replication/changesets +lock_file: /var/run/lock/changeset-replication/lockfile -- 2.39.5