From 74aa47828d1013e7c13bee0ec9fd5b1901f20f76 Mon Sep 17 00:00:00 2001
From: Devon Hudson <devon.dmytro@gmail.com>
Date: Wed, 12 Feb 2025 14:37:56 +0000
Subject: [PATCH] Add log message when worker lock timeouts get large (#18124)

This is to help track down a possible, but very rare, worker deadlock
that was seen on matrix.org.
In theory, you could work back from an instance of these new logs to the
approximate time when the lock was obtained and focus the diagnostic
efforts there.

### Pull Request Checklist

<!-- Please read
https://element-hq.github.io/synapse/latest/development/contributing_guide.html
before submitting your pull request -->

* [x] Pull request is based on the develop branch
* [x] Pull request includes a [changelog
file](https://element-hq.github.io/synapse/latest/development/contributing_guide.html#changelog).
The entry should:
  - Be a short description of your change which makes sense to users.
    "Fixed a bug that prevented receiving messages from other servers."
    instead of "Moved X method from `EventStore` to `EventWorkerStore`.".
  - Use markdown where necessary, mostly for `code blocks`.
  - End with either a period (.) or an exclamation mark (!).
  - Start with a capital letter.
  - Feel free to credit yourself, by adding a sentence "Contributed by
    @github_username." or "Contributed by [Your Name]." to the end of the
    entry.
* [x] [Code
style](https://element-hq.github.io/synapse/latest/code_style.html) is
correct
(run the
[linters](https://element-hq.github.io/synapse/latest/development/contributing_guide.html#run-the-linters))
---
 changelog.d/18124.misc          | 1 +
 synapse/handlers/worker_lock.py | 9 +++++++++
 2 files changed, 10 insertions(+)
 create mode 100644 changelog.d/18124.misc

diff --git a/changelog.d/18124.misc b/changelog.d/18124.misc
new file mode 100644
index 0000000000..8ac6a73a20
--- /dev/null
+++ b/changelog.d/18124.misc
@@ -0,0 +1 @@
+Add log message when worker lock timeouts get large.
diff --git a/synapse/handlers/worker_lock.py b/synapse/handlers/worker_lock.py
index db998f6701..e58a416026 100644
--- a/synapse/handlers/worker_lock.py
+++ b/synapse/handlers/worker_lock.py
@@ -19,6 +19,7 @@
 #
 #
 
+import logging
 import random
 from types import TracebackType
 from typing import (
@@ -269,6 +270,10 @@ class WaitingLock:
     def _get_next_retry_interval(self) -> float:
         next = self._retry_interval
         self._retry_interval = max(5, next * 2)
+        if self._retry_interval > 5 * 2**7:  # 640s, i.e. ~10 minutes
+            logging.warning(
+                f"Lock timeout is getting excessive: {self._retry_interval}s. There may be a deadlock."
+            )
         return next * random.uniform(0.9, 1.1)
 
 
@@ -344,4 +349,8 @@ class WaitingMultiLock:
     def _get_next_retry_interval(self) -> float:
         next = self._retry_interval
         self._retry_interval = max(5, next * 2)
+        if self._retry_interval > 5 * 2**7:  # 640s, i.e. ~10 minutes
+            logging.warning(
+                f"Lock timeout is getting excessive: {self._retry_interval}s. There may be a deadlock."
+            )
         return next * random.uniform(0.9, 1.1)
-- 
GitLab