-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Description
Environment
- ejabberd version: 24.12
- Erlang version:
14.2.1 - OS: Linux (Debian)
- Installed from: docker
Configuration
loglevel: info
log_rotate_size: 1048576000
log_rotate_count: 7
certfiles:
- "/home/ejabberd/conf/domain.pem"
ca_file: "/home/ejabberd/conf/cacert.pem"
sql_type: mysql
sql_pool_size: 20
sql_keepalive_interval: 1
sql_start_interval: 5
new_sql_schema: false
update_sql_schema: false
default_db: sql
default_ram_db: mnesia
auth_method: sql
allow_contrib_modules: true
cache_size: 20000
max_fsm_queue: 30000
listen:
- port: 5222
ip: "::"
module: ejabberd_c2s
protocol_options:
- "no_sslv2"
- "no_sslv3"
- "no_tlsv1"
- "no_tlsv1_1"
ciphers: "my cipher"
starttls: true
starttls_required: false
tls_compression: false
max_stanza_size: 262144
shaper: c2s_shaper
access: c2s
use_proxy_protocol: false
- port: 5269
ip: "::"
module: ejabberd_s2s_in
max_stanza_size: 524288
- port: 5443
ip: "::"
module: ejabberd_http
request_handlers:
/upload: mod_my_http_upload
/ws: ejabberd_http_ws
/bosh: mod_bosh
captcha: false
use_proxy_protocol: false
tls: false
- port: 5280
ip: "::"
module: ejabberd_http
request_handlers:
/admin: ejabberd_web_admin
/api/v0: mod_http_api
/api: mod_http_api
captcha: false
use_proxy_protocol: false
tls: false
- port: 9334
ip: "::"
module: ejabberd_http
request_handlers:
/metrics: mod_prometheus
captcha: false
use_proxy_protocol: false
tls: false
s2s_use_starttls: optional
acl:
local:
user_regexp: ""
loopback:
ip:
- 127.0.0.0/8
- ::1/128
admin:
user:
- "[email protected]"
access_rules:
local:
allow: local
c2s:
deny: blocked
allow: all
announce:
allow: admin
configure:
allow: admin
muc_create:
allow: local
pubsub_createnode:
allow: local
register:
allow: local
trusted_network:
allow: loopback
api_permissions:
"console commands":
from:
- ejabberd_ctl
who: all
what: "*"
"admin access":
who:
- ip: "127.0.0.0/8"
- access:
- allow:
- acl: loopback
- acl: admin
what:
- "*"
- "!stop"
- "!start"
"public commands":
who:
ip: 127.0.0.1/8
what:
- status
- connected_users_number
shaper:
normal:
rate: 3000
burst_size: 20000
fast: 100000
shaper_rules:
max_user_sessions: 10
max_user_offline_messages:
5000: admin
500: all
c2s_shaper:
none: admin
normal: all
s2s_shaper: fast
acme:
auto: false
contact: "mailto:[email protected]"
ca_url: "https://acme-v02.api.letsencrypt.org/directory"
modules:
mod_adhoc: {}
# mod_admin_update_sql: {}
mod_admin_extra: {}
mod_announce:
access: announce
mod_avatar: {}
mod_blocking: {}
mod_bosh: {}
mod_caps:
use_cache: true
mod_carboncopy: {}
mod_client_state: {}
mod_configure: {}
mod_disco: {}
# mod_fail2ban: {}
mod_http_api:
default_version: 0
# mod_http_upload: {}
mod_last: {}
mod_mam:
## Mnesia is limited to 2GB, better to use an SQL backend
## For small servers SQLite is a good fit and is very easy
## to configure. Uncomment this when you have SQL configured:
db_type: sql
assume_mam_usage: true
default: always
user_mucsub_from_muc_archive: true
# mod_mqtt: {}
mod_muc:
access:
- allow
access_admin:
- allow: admin
access_create:
- allow: admin
access_persistent: muc_create
access_mam:
- allow
default_room_options:
allow_user_invites: false
allow_subscription: true
allow_change_subj: false
allow_query_users: true
allowpm: anyone
mam: true
members_by_default: true
members_only: false
logging: true
persistent: true
anonymous: false
public: false
presence_broadcast:
- visitor
history_size: 0
max_users: 5000
max_user_conferences: 5000
preload_rooms: false
mod_muc_admin: {
subscribe_room_many_max_users: 4000
}
mod_offline:
access_max_user_messages: max_user_offline_messages
mod_ping:
send_pings: true
ping_interval: 60
ping_ack_timeout: 30
timeout_action: kill
mod_privacy: {}
# mod_private: {}
mod_proxy65:
access: local
max_connections: 5
mod_pubsub:
access_createnode: pubsub_createnode
## reduces resource comsumption, but XEP incompliant
ignore_pep_from_offline: false
## XEP compliant, but increases resource comsumption
## ignore_pep_from_offline: false
last_item_cache: false
max_items_node: 1000
plugins:
- flat
- pep
force_node_config:
## Change from "whitelist" to "open" to enable OMEMO support
## See https://github.com/processone/ejabberd/issues/2425
"eu.siacs.conversations.axolotl.*":
access_model: whitelist
## Avoid buggy clients to make their bookmarks public
storage:bookmarks:
access_model: whitelist
service:roster:x:
access_model: presence
notification_type: normal
mod_push: {}
mod_push_keepalive: {}
mod_register:
## Only accept registration requests from the "trusted"
## network (see access_rules section above).
## Think twice before enabling registration from any
## address. See the Jabber SPAM Manifesto for details:
## https://github.com/ge0rg/jabber-spam-fighting-manifesto
ip_access: trusted_network
mod_roster:
versioning: true
store_current_id: true
db_type: sql
cache_size: 80000
mod_s2s_dialback: {}
mod_shared_roster: {}
mod_stream_mgmt:
resend_on_timeout: true
resume_timeout: 30
max_ack_queue: 10000
# mod_stun_disco: {}
mod_vcard: {}
mod_vcard_xupdate: {}
mod_version:
show_os: false
...Bug description
We are experiencing a critical "thundering herd" scenario following cluster instability. When a cluster node fails and users are disconnected, if the outage exceeds the resume_timeout period, all clients attempt a full reconnection simultaneously once the service becomes available.
This results in a massive spike in database load (CPU and Memory) as thousands of clients perform full authentication and session binding at the same time. In our recent incident, the database became completely unresponsive (choked), leading to a "reconnection recycle" loop where the DB couldn't process enough requests to stabilize the cluster.
Resolution Observed:
We were only able to stabilize the system after scaling our DB resources (CPU and Memory) by 3x to handle the initial surge.
I see that for each reconnection, the following queries are being run in the DB:
2025-06-10 15:30:38.213780+00:00 [debug] <0.646.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select version from roster_version where username = 'username' and 0=0"
2025-06-10 15:30:38.246180+00:00 [debug] <0.648.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "begin;"
2025-06-10 15:30:38.246621+00:00 [debug] <0.648.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select username, xml from spool where username='username' and 0=0 order by seq;"
2025-06-10 15:30:38.247114+00:00 [debug] <0.648.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "delete from spool where username='username' and 0=0;"
2025-06-10 15:30:38.247336+00:00 [debug] <0.648.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "commit;"
2025-06-10 15:30:38.247600+00:00 [debug] <0.650.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select username, jid, nick, subscription, ask, askmessage, server, subscribe, type from rosterusers where username='username' and 0=0"
2025-06-10 15:30:38.248444+00:00 [debug] <0.652.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select jid, grp from rostergroups where username='username' and 0=0"
2025-06-10 15:30:38.249294+00:00 [debug] <0.654.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select node, plugin, i.nodeid, jid, subscriptions from pubsub_state i, pubsub_node n where i.nodeid = n.nodeid and jid in ('[email protected]/6186', '[email protected]') and host='pubsub.myserver.com'"
2025-06-10 15:30:38.249978+00:00 [debug] <0.654.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select name, val from pubsub_node_option where nodeid=6354"
2025-06-10 15:30:38.250523+00:00 [debug] <0.675.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select itemid, publisher, creation, modification, payload from pubsub_item where nodeid='6354' order by modification desc limit 1"
2025-06-10 15:30:38.252298+00:00 [debug] <0.617.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select host, node, plugin, i.nodeid, jid, subscriptions from pubsub_state i, pubsub_node n where i.nodeid = n.nodeid and jid in ('[email protected]/186','[email protected]') and host like '%@myserver.com' "
2025-06-10 15:30:38.252743+00:00 [debug] <0.617.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select name, val from pubsub_node_option where nodeid=11939"
2025-06-10 15:30:38.253870+00:00 [debug] <0.617.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select name, val from pubsub_node_option where nodeid=11940"
2025-06-10 15:30:38.254270+00:00 [debug] <0.620.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select username, jid, nick, subscription, ask, askmessage, server, subscribe, type from rosterusers where username='username' and 0=0"
2025-06-10 15:30:38.255223+00:00 [debug] <0.622.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select jid, grp from rostergroups where username='username' and 0=0"
2025-06-10 15:30:38.301391+00:00 [debug] <0.624.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select node, parent, plugin, nodeid from pubsub_node where host='[email protected]' and node='storage:bookmarks'"
2025-06-10 15:30:38.305027+00:00 [debug] <0.626.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select subscription, ask from rosterusers where username='username' and 0=0 and jid='[email protected]'"
2025-06-10 15:30:38.306016+00:00 [debug] <0.628.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select node, parent, plugin, nodeid from pubsub_node where host='[email protected]'"
2025-06-10 15:30:38.306543+00:00 [debug] <0.628.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select name, val from pubsub_node_option where nodeid=11939"
2025-06-10 15:30:38.306958+00:00 [debug] <0.628.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select name, val from pubsub_node_option where nodeid=11940"
2025-06-10 15:30:38.307450+00:00 [debug] <0.630.0>@ejabberd_sql:sql_query_internal/1:760 SQL: "select itemid, publisher, creation, modification, payload from pubsub_item where nodeid='11939'"
Is there and configuration or something else we can do to avoid this situation?