From f694bb71b7ea7841a5b5db3d884dfda5a3f78023 Mon Sep 17 00:00:00 2001
From: Eric Eastwood <erice@element.io>
Date: Fri, 9 Sep 2022 11:30:06 -0500
Subject: [PATCH] Strip number suffix from instance name to consolidate
 services that traces are spread over (#13729)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The problem with many services is that it makes it hard to find which service has the trace you want, see https://github.com/jaegertracing/jaeger-ui/issues/985

Previously, we split traces out into services based on their instance name like `matrix.org client_reader-1`, etc but there are many worker instances of the same `client_reader` so there is a lot to click through.

With this PR, all of the traces are just collected under the worker type like `client_reader`, `event_persister` 😇

Note: A Synapse worker instance name is an opaque string with the number convention only being our own thing for the `matrix.org` deployment. But seems pretty sensible to group things this way.
---
 changelog.d/13729.misc         |  1 +
 synapse/logging/opentracing.py | 13 ++++++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)
 create mode 100644 changelog.d/13729.misc

diff --git a/changelog.d/13729.misc b/changelog.d/13729.misc
new file mode 100644
index 0000000000..c6a6f617e3
--- /dev/null
+++ b/changelog.d/13729.misc
@@ -0,0 +1 @@
+Strip number suffix from instance name to consolidate services that traces are spread over.
diff --git a/synapse/logging/opentracing.py b/synapse/logging/opentracing.py
index 482316a1ff..adf3f54770 100644
--- a/synapse/logging/opentracing.py
+++ b/synapse/logging/opentracing.py
@@ -203,6 +203,9 @@ if TYPE_CHECKING:
 
 # Helper class
 
+# Matches the number suffix in an instance name like "matrix.org client_reader-8"
+STRIP_INSTANCE_NUMBER_SUFFIX_REGEX = re.compile(r"[_-]?\d+$")
+
 
 class _DummyTagNames:
     """wrapper of opentracings tags. We need to have them if we
@@ -441,9 +444,17 @@ def init_tracer(hs: "HomeServer") -> None:
 
     from jaeger_client.metrics.prometheus import PrometheusMetricsFactory
 
+    # Instance names are opaque strings but by stripping off the number suffix,
+    # we can get something that looks like a "worker type", e.g.
+    # "client_reader-1" -> "client_reader" so we don't spread the traces across
+    # so many services.
+    instance_name_by_type = re.sub(
+        STRIP_INSTANCE_NUMBER_SUFFIX_REGEX, "", hs.get_instance_name()
+    )
+
     config = JaegerConfig(
         config=hs.config.tracing.jaeger_config,
-        service_name=f"{hs.config.server.server_name} {hs.get_instance_name()}",
+        service_name=f"{hs.config.server.server_name} {instance_name_by_type}",
         scope_manager=LogContextScopeManager(),
         metrics_factory=PrometheusMetricsFactory(),
     )
-- 
GitLab