Commit 057f04fa
Authored 4 years ago by Richard van der Hoff

Report state res metrics to Prometheus and log

Parent: 8412c08a
No related branches, tags, or merge requests found.
Changes: 1 changed file

synapse/state/__init__.py (+124 additions, −20 deletions)
@@ -13,42 +13,46 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import heapq
 import logging
-from collections import namedtuple
+from collections import defaultdict, namedtuple
 from typing import (
     Any,
     Awaitable,
     Callable,
+    DefaultDict,
     Dict,
     Iterable,
     List,
     Optional,
     Sequence,
     Set,
     Tuple,
     Union,
     overload,
 )

 import attr
 from frozendict import frozendict
-from prometheus_client import Histogram
+from prometheus_client import Counter, Histogram
 from typing_extensions import Literal

 from synapse.api.constants import EventTypes
 from synapse.api.room_versions import KNOWN_ROOM_VERSIONS, StateResolutionVersions
 from synapse.events import EventBase
 from synapse.events.snapshot import EventContext
+from synapse.logging.context import ContextResourceUsage
 from synapse.logging.utils import log_function
 from synapse.state import v1, v2
 from synapse.storage.databases.main.events_worker import EventRedactBehaviour
 from synapse.storage.roommember import ProfileInfo
 from synapse.types import Collection, StateMap
 from synapse.util import Clock
 from synapse.util.async_helpers import Linearizer
 from synapse.util.caches.expiringcache import ExpiringCache
 from synapse.util.metrics import Measure, measure_func

 logger = logging.getLogger(__name__)
+metrics_logger = logging.getLogger("synapse.state.metrics")

 # Metrics for number of state groups involved in a resolution.
 state_groups_histogram = Histogram(
@@ -459,6 +463,33 @@ class StateHandler:
         return {key: state_map[ev_id] for key, ev_id in new_state.items()}


+@attr.s(slots=True)
+class _StateResMetrics:
+    """Keeps track of some usage metrics about state res."""
+
+    # System and User CPU time, in seconds
+    cpu_time = attr.ib(type=float, default=0.0)
+
+    # time spent on database transactions (excluding scheduling time). This roughly
+    # corresponds to the amount of work done on the db server, excluding event fetches.
+    db_time = attr.ib(type=float, default=0.0)
+
+    # number of events fetched from the db.
+    db_events = attr.ib(type=int, default=0)
+
+
+_biggest_room_by_cpu_counter = Counter(
+    "synapse_state_res_cpu_for_biggest_room_seconds",
+    "CPU time spent performing state resolution for the single most expensive "
+    "room for state resolution",
+)
+_biggest_room_by_db_counter = Counter(
+    "synapse_state_res_db_for_biggest_room_seconds",
+    "Database time spent performing state resolution for the single most "
+    "expensive room for state resolution",
+)
+
+
 class StateResolutionHandler:
     """Responsible for doing state conflict resolution.
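A note on the two module-level Counters added above: prometheus_client registers a Counter globally when it is constructed, and it can then be incremented by an arbitrary non-negative amount. A minimal, hypothetical sketch of that behaviour (the metric name below is made up, not one of Synapse's):

from prometheus_client import Counter, generate_latest

# Hypothetical counter, shaped like the two defined in this hunk.
_example_counter = Counter(
    "example_state_res_seconds",
    "Example: seconds spent on some expensive work",
)

_example_counter.inc(1.5)  # counters accept fractional increments
# The exposition output reports the running total as example_state_res_seconds_total.
print(generate_latest().decode("ascii"))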
@@ -481,6 +512,17 @@ class StateResolutionHandler:
             reset_expiry_on_get=True,
         )

+        #
+        # stuff for tracking time spent on state-res by room
+        #
+
+        # tracks the amount of work done on state res per room
+        self._state_res_metrics = defaultdict(
+            _StateResMetrics
+        )  # type: DefaultDict[str, _StateResMetrics]
+
+        self.clock.looping_call(self._report_metrics, 120 * 1000)
+
     @log_function
     async def resolve_state_groups(
         self,
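The bookkeeping added here relies on defaultdict(_StateResMetrics): the first access for a given room ID creates a zeroed metrics record, so callers can accumulate into it without an existence check. Synapse's Clock.looping_call takes its interval in milliseconds, so 120 * 1000 is a two-minute reporting cadence. A stand-alone sketch of the same accumulate-then-flush pattern, with made-up names and a plain print() standing in for the real reporting:

from collections import defaultdict

import attr


@attr.s(slots=True)
class _RoomUsage:  # illustrative stand-in for _StateResMetrics
    cpu_time = attr.ib(type=float, default=0.0)
    db_events = attr.ib(type=int, default=0)


_usage = defaultdict(_RoomUsage)


def record(room_id: str, cpu: float, events: int) -> None:
    # the record is created on first access, then accumulated into
    m = _usage[room_id]
    m.cpu_time += cpu
    m.db_events += events


def report() -> None:
    # in Synapse this runs on a timer (looping_call); here it is called directly
    for room_id, m in _usage.items():
        print("%s: %.2fs CPU, %d events" % (room_id, m.cpu_time, m.db_events))
    _usage.clear()


record("!room:example.org", 0.25, 3)
record("!room:example.org", 0.10, 1)
report()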
@@ -578,21 +620,83 @@ class StateResolutionHandler:
         Returns:
             a map from (type, state_key) to event_id.
         """
-        with Measure(self.clock, "state._resolve_events"):
-            v = KNOWN_ROOM_VERSIONS[room_version]
-            if v.state_res == StateResolutionVersions.V1:
-                return await v1.resolve_events_with_store(
-                    room_id, state_sets, event_map, state_res_store.get_events
-                )
-            else:
-                return await v2.resolve_events_with_store(
-                    self.clock,
-                    room_id,
-                    room_version,
-                    state_sets,
-                    event_map,
-                    state_res_store,
-                )
+        try:
+            with Measure(self.clock, "state._resolve_events") as m:
+                v = KNOWN_ROOM_VERSIONS[room_version]
+                if v.state_res == StateResolutionVersions.V1:
+                    return await v1.resolve_events_with_store(
+                        room_id, state_sets, event_map, state_res_store.get_events
+                    )
+                else:
+                    return await v2.resolve_events_with_store(
+                        self.clock,
+                        room_id,
+                        room_version,
+                        state_sets,
+                        event_map,
+                        state_res_store,
+                    )
+        finally:
+            self._record_state_res_metrics(room_id, m.get_resource_usage())
+
+    def _record_state_res_metrics(self, room_id: str, rusage: ContextResourceUsage):
+        room_metrics = self._state_res_metrics[room_id]
+        room_metrics.cpu_time += rusage.ru_utime + rusage.ru_stime
+        room_metrics.db_time += rusage.db_txn_duration_sec
+        room_metrics.db_events += rusage.evt_db_fetch_count
+
+    def _report_metrics(self):
+        if not self._state_res_metrics:
+            # no state res has happened since the last iteration: don't bother logging.
+            return
+
+        self._report_biggest(
+            lambda i: i.cpu_time,
+            "CPU time",
+            _biggest_room_by_cpu_counter,
+        )
+
+        self._report_biggest(
+            lambda i: i.db_time,
+            "DB time",
+            _biggest_room_by_db_counter,
+        )
+
+        self._state_res_metrics.clear()
+
+    def _report_biggest(
+        self,
+        extract_key: Callable[[_StateResMetrics], Any],
+        metric_name: str,
+        prometheus_counter_metric: Counter,
+    ) -> None:
+        """Report metrics on the biggest rooms for state res
+
+        Args:
+            extract_key: a callable which, given a _StateResMetrics, extracts a single
+                metric to sort by.
+            metric_name: the name of the metric we have extracted, for the log line
+            prometheus_counter_metric: a prometheus metric recording the sum of the
+                the extracted metric
+        """
+        n_to_log = 10
+        if not metrics_logger.isEnabledFor(logging.DEBUG):
+            # only need the most expensive if we don't have debug logging, which
+            # allows nlargest() to degrade to max()
+            n_to_log = 1
+
+        items = self._state_res_metrics.items()
+
+        # log the N biggest rooms
+        biggest = heapq.nlargest(
+            n_to_log, items, key=lambda i: extract_key(i[1])
+        )  # type: List[Tuple[str, _StateResMetrics]]
+        metrics_logger.debug(
+            "%i biggest rooms for state-res by %s: %s",
+            len(biggest),
+            metric_name,
+            ["%s (%gs)" % (r, extract_key(m)) for (r, m) in biggest],
+        )
+
+        # report info on the single biggest to prometheus
+        _, biggest_metrics = biggest[0]
+        prometheus_counter_metric.inc(extract_key(biggest_metrics))
+

 def _make_state_cache_entry(
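Taken together, the new methods accumulate per-room resource usage as state resolutions complete, and every two minutes log the most expensive rooms while feeding only the single biggest into the Prometheus counters. A condensed, stand-alone sketch of that reporting step, with hypothetical names and fake per-room totals rather than Synapse's own:

import heapq
import logging

from prometheus_client import Counter

logging.basicConfig(level=logging.DEBUG)
metrics_logger = logging.getLogger("example.metrics")

# Hypothetical counter and per-room CPU totals, standing in for the real ones above.
_biggest_room_cpu = Counter(
    "example_state_res_cpu_for_biggest_room_seconds",
    "CPU time spent on state resolution in the most expensive room",
)
per_room_cpu = {"!a:example.org": 0.8, "!b:example.org": 2.4, "!c:example.org": 0.1}

# Take the N most expensive rooms; with n_to_log=1, nlargest() degrades to a max().
n_to_log = 10 if metrics_logger.isEnabledFor(logging.DEBUG) else 1
biggest = heapq.nlargest(n_to_log, per_room_cpu.items(), key=lambda i: i[1])

metrics_logger.debug(
    "%i biggest rooms by CPU time: %s",
    len(biggest),
    ["%s (%gs)" % (room, secs) for room, secs in biggest],
)

# Only the single most expensive room is reported to Prometheus.
_, top_seconds = biggest[0]
_biggest_room_cpu.inc(top_seconds)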