Unverified commit a2d7195e authored by David Robertson, committed by GitHub

Track why we're evicting from caches (#10829)

So we can distinguish between "evicting because the cache is too big" and "evicting because the cache entries haven't been recently used".
parent 51e2db35
+Track cache eviction rates more finely in Prometheus' monitoring.
\ No newline at end of file
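For readers skimming the diff, the shape of the change is: the eviction gauges gain a second label, `reason`, whose values come from a small enum, and callers now say which of the two causes triggered an eviction. A minimal standalone sketch of that pattern follows; the gauge name `demo_cache_evicted_size`, the cache name `demo_cache` and the helper `record_eviction` are illustrative only, not Synapse's real names.

import collections
from enum import Enum, auto

from prometheus_client import Gauge


class EvictionReason(Enum):
    size = auto()  # evicted because the cache grew past its maximum size
    time = auto()  # evicted because the entry had not been used recently


# Illustrative gauge; the real metrics are the synapse_util_caches_* gauges below.
demo_cache_evicted_size = Gauge(
    "demo_cache_evicted_size",
    "Entries evicted, by cache name and reason",
    ["name", "reason"],
)

evictions_by_reason: "collections.Counter[EvictionReason]" = collections.Counter()


def record_eviction(reason: EvictionReason, size: int = 1) -> None:
    # Accumulate per-reason totals, then export one time series per reason.
    evictions_by_reason[reason] += size
    for r in EvictionReason:
        demo_cache_evicted_size.labels("demo_cache", r.name).set(evictions_by_reason[r])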
@@ -12,8 +12,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import collections
 import logging
+import typing
+from enum import Enum, auto
 from sys import intern
 from typing import Callable, Dict, Optional, Sized
@@ -34,7 +36,7 @@ collectors_by_name: Dict[str, "CacheMetric"] = {}
 
 cache_size = Gauge("synapse_util_caches_cache:size", "", ["name"])
 cache_hits = Gauge("synapse_util_caches_cache:hits", "", ["name"])
-cache_evicted = Gauge("synapse_util_caches_cache:evicted_size", "", ["name"])
+cache_evicted = Gauge("synapse_util_caches_cache:evicted_size", "", ["name", "reason"])
 cache_total = Gauge("synapse_util_caches_cache:total", "", ["name"])
 cache_max_size = Gauge("synapse_util_caches_cache_max_size", "", ["name"])
 cache_memory_usage = Gauge(
@@ -46,11 +48,16 @@ cache_memory_usage = Gauge(
 response_cache_size = Gauge("synapse_util_caches_response_cache:size", "", ["name"])
 response_cache_hits = Gauge("synapse_util_caches_response_cache:hits", "", ["name"])
 response_cache_evicted = Gauge(
-    "synapse_util_caches_response_cache:evicted_size", "", ["name"]
+    "synapse_util_caches_response_cache:evicted_size", "", ["name", "reason"]
 )
 response_cache_total = Gauge("synapse_util_caches_response_cache:total", "", ["name"])
 
 
+class EvictionReason(Enum):
+    size = auto()
+    time = auto()
+
+
 @attr.s(slots=True)
 class CacheMetric:
@@ -61,7 +68,9 @@ class CacheMetric:
     hits = attr.ib(default=0)
     misses = attr.ib(default=0)
-    evicted_size = attr.ib(default=0)
+    eviction_size_by_reason: typing.Counter[EvictionReason] = attr.ib(
+        factory=collections.Counter
+    )
     memory_usage = attr.ib(default=None)
 
     def inc_hits(self) -> None:
@@ -70,8 +79,8 @@ class CacheMetric:
     def inc_misses(self) -> None:
         self.misses += 1
 
-    def inc_evictions(self, size: int = 1) -> None:
-        self.evicted_size += size
+    def inc_evictions(self, reason: EvictionReason, size: int = 1) -> None:
+        self.eviction_size_by_reason[reason] += size
 
     def inc_memory_usage(self, memory: int) -> None:
         if self.memory_usage is None:
@@ -94,14 +103,20 @@ class CacheMetric:
             if self._cache_type == "response_cache":
                 response_cache_size.labels(self._cache_name).set(len(self._cache))
                 response_cache_hits.labels(self._cache_name).set(self.hits)
-                response_cache_evicted.labels(self._cache_name).set(self.evicted_size)
+                for reason in EvictionReason:
+                    response_cache_evicted.labels(self._cache_name, reason.name).set(
+                        self.eviction_size_by_reason[reason]
+                    )
                 response_cache_total.labels(self._cache_name).set(
                     self.hits + self.misses
                 )
             else:
                 cache_size.labels(self._cache_name).set(len(self._cache))
                 cache_hits.labels(self._cache_name).set(self.hits)
-                cache_evicted.labels(self._cache_name).set(self.evicted_size)
+                for reason in EvictionReason:
+                    cache_evicted.labels(self._cache_name, reason.name).set(
+                        self.eviction_size_by_reason[reason]
+                    )
                 cache_total.labels(self._cache_name).set(self.hits + self.misses)
                 if getattr(self._cache, "max_size", None):
                     cache_max_size.labels(self._cache_name).set(self._cache.max_size)
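To make the new bookkeeping above concrete, here is how the per-reason counter behaves in isolation. `ToyCacheMetric` is a hypothetical stand-in mirroring the `CacheMetric` pattern, not the real class.

import collections
from enum import Enum, auto


class EvictionReason(Enum):
    size = auto()
    time = auto()


class ToyCacheMetric:
    """Stripped-down stand-in mirroring the per-reason eviction counter."""

    def __init__(self) -> None:
        self.eviction_size_by_reason: "collections.Counter[EvictionReason]" = (
            collections.Counter()
        )

    def inc_evictions(self, reason: EvictionReason, size: int = 1) -> None:
        self.eviction_size_by_reason[reason] += size


metric = ToyCacheMetric()
metric.inc_evictions(EvictionReason.size, 3)  # e.g. an iterable cache dropping 3 values
metric.inc_evictions(EvictionReason.time)     # a single expired entry

# Iterating over the whole enum, as collect() does above, exports every reason,
# including reasons that have not fired yet, so both series always exist.
for reason in EvictionReason:
    print(reason.name, metric.eviction_size_by_reason[reason])  # size 3 / time 1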
@@ -22,7 +22,7 @@ from typing_extensions import Literal
 
 from synapse.config import cache as cache_config
 from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.util import Clock
-from synapse.util.caches import register_cache
+from synapse.util.caches import EvictionReason, register_cache
 
 logger = logging.getLogger(__name__)
@@ -98,9 +98,9 @@ class ExpiringCache(Generic[KT, VT]):
         while self._max_size and len(self) > self._max_size:
             _key, value = self._cache.popitem(last=False)
             if self.iterable:
-                self.metrics.inc_evictions(len(value.value))
+                self.metrics.inc_evictions(EvictionReason.size, len(value.value))
             else:
-                self.metrics.inc_evictions()
+                self.metrics.inc_evictions(EvictionReason.size)
 
     def __getitem__(self, key: KT) -> VT:
         try:
@@ -175,9 +175,9 @@ class ExpiringCache(Generic[KT, VT]):
         for k in keys_to_delete:
             value = self._cache.pop(k)
             if self.iterable:
-                self.metrics.inc_evictions(len(value.value))
+                self.metrics.inc_evictions(EvictionReason.time, len(value.value))
             else:
-                self.metrics.inc_evictions()
+                self.metrics.inc_evictions(EvictionReason.time)
 
         logger.debug(
             "[%s] _prune_cache before: %d, after len: %d",
@@ -40,7 +40,7 @@ from twisted.internet.interfaces import IReactorTime
 
 from synapse.config import cache as cache_config
 from synapse.metrics.background_process_metrics import wrap_as_background_process
 from synapse.util import Clock, caches
-from synapse.util.caches import CacheMetric, register_cache
+from synapse.util.caches import CacheMetric, EvictionReason, register_cache
 from synapse.util.caches.treecache import TreeCache, iterate_tree_cache_entry
 from synapse.util.linked_list import ListNode
@@ -403,7 +403,7 @@ class LruCache(Generic[KT, VT]):
                 evicted_len = delete_node(node)
                 cache.pop(node.key, None)
                 if metrics:
-                    metrics.inc_evictions(evicted_len)
+                    metrics.inc_evictions(EvictionReason.size, evicted_len)
 
         def synchronized(f: FT) -> FT:
             @wraps(f)
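An LruCache only evicts when it is over its size limit, so this path always reports `size`. Once the labelled gauges are filled in, the split is visible directly in the exported text format; the following is a quick local sketch using prometheus_client, where the cache name `getEvent` and the sample values are just examples.

from prometheus_client import CollectorRegistry, Gauge, generate_latest

registry = CollectorRegistry()
evicted = Gauge(
    "synapse_util_caches_cache:evicted_size", "", ["name", "reason"], registry=registry
)

# What a cache that dropped 10 entries for size and 4 for staleness would export.
evicted.labels("getEvent", "size").set(10)
evicted.labels("getEvent", "time").set(4)

print(generate_latest(registry).decode())
# ...includes lines such as:
# synapse_util_caches_cache:evicted_size{name="getEvent",reason="size"} 10.0
# synapse_util_caches_cache:evicted_size{name="getEvent",reason="time"} 4.0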