From e2c300e7e4647500b0ff5ada8018cc984b78f91c Mon Sep 17 00:00:00 2001
From: Michael Kaye <1917473+michaelkaye@users.noreply.github.com>
Date: Fri, 26 Nov 2021 14:05:20 +0000
Subject: [PATCH] Create healthcheck script for synapse-workers container
 (#11429)

The intent is to iterate through all the worker ports and only
report healthy when all are healthy, starting with the main process.
---
 changelog.d/11429.docker              |  1 +
 docker/Dockerfile-workers             |  3 +++
 docker/conf-workers/healthcheck.sh.j2 |  6 ++++++
 docker/configure_workers_and_start.py | 13 +++++++++++++
 4 files changed, 23 insertions(+)
 create mode 100644 changelog.d/11429.docker
 create mode 100644 docker/conf-workers/healthcheck.sh.j2

diff --git a/changelog.d/11429.docker b/changelog.d/11429.docker
new file mode 100644
index 0000000000..81db719ed6
--- /dev/null
+++ b/changelog.d/11429.docker
@@ -0,0 +1 @@
+Update `Dockerfile-workers` to healthcheck all workers in the container.
diff --git a/docker/Dockerfile-workers b/docker/Dockerfile-workers
index 969cf97286..46f2e17382 100644
--- a/docker/Dockerfile-workers
+++ b/docker/Dockerfile-workers
@@ -21,3 +21,6 @@ VOLUME ["/data"]
 # files to run the desired worker configuration. Will start supervisord.
 COPY ./docker/configure_workers_and_start.py /configure_workers_and_start.py
 ENTRYPOINT ["/configure_workers_and_start.py"]
+
+HEALTHCHECK --start-period=5s --interval=15s --timeout=5s \
+    CMD /bin/sh /healthcheck.sh
diff --git a/docker/conf-workers/healthcheck.sh.j2 b/docker/conf-workers/healthcheck.sh.j2
new file mode 100644
index 0000000000..79c621f89c
--- /dev/null
+++ b/docker/conf-workers/healthcheck.sh.j2
@@ -0,0 +1,6 @@
+#!/bin/sh
+# This healthcheck script is designed to return OK only when every
+# health endpoint listed below returns OK; it exits 1 on the first failure.
+{%- for healthcheck_url in healthcheck_urls %}
+curl -fSs {{ healthcheck_url }} || exit 1
+{%- endfor %}
diff --git a/docker/configure_workers_and_start.py b/docker/configure_workers_and_start.py
index f4ac1c22a4..adbb551cee 100755
--- a/docker/configure_workers_and_start.py
+++ b/docker/configure_workers_and_start.py
@@ -474,10 +474,16 @@ def generate_worker_files(environ, config_path: str, data_dir: str):
 
     # Determine the load-balancing upstreams to configure
     nginx_upstream_config = ""
+
+    # At the same time, prepare a list of internal endpoints to healthcheck,
+    # starting with the main process, which exists even if no workers do.
+    healthcheck_urls = ["http://localhost:8080/health"]
+
     for upstream_worker_type, upstream_worker_ports in nginx_upstreams.items():
         body = ""
         for port in upstream_worker_ports:
             body += "    server localhost:%d;\n" % (port,)
+            healthcheck_urls.append("http://localhost:%d/health" % (port,))
 
         # Add to the list of configured upstreams
         nginx_upstream_config += NGINX_UPSTREAM_CONFIG_BLOCK.format(
@@ -510,6 +516,13 @@ def generate_worker_files(environ, config_path: str, data_dir: str):
         worker_config=supervisord_config,
     )
 
+    # healthcheck config
+    convert(
+        "/conf/healthcheck.sh.j2",
+        "/healthcheck.sh",
+        healthcheck_urls=healthcheck_urls,
+    )
+
     # Ensure the logging directory exists
     log_dir = data_dir + "/logs"
     if not os.path.exists(log_dir):
-- 
GitLab