Hi,
Over the past few days, my HAProxy has suddenly started misbehaving and is getting out of control.
It had been running version 1.8.8 for more than a year and worked fine.
But now it suddenly keeps eating up all the memory, including swap, and gets killed by the OOM killer about every two hours.
I have tried fine-tuning the configuration and upgrading to version 1.9.12,
but still have had no luck.
Below are the details:
Output of `ulimit -a`:
core file size (blocks, -c) 0
data seg size (kbytes, -d) unlimited
scheduling priority (-e) 0
file size (blocks, -f) unlimited
pending signals (-i) 95690
max locked memory (kbytes, -l) 2000000000
max memory size (kbytes, -m) 20000000000
open files (-n) 1024
pipe size (512 bytes, -p) 8
POSIX message queues (bytes, -q) 819200
real-time priority (-r) 0
stack size (kbytes, -s) 8192
cpu time (seconds, -t) unlimited
max user processes (-u) 95690
virtual memory (kbytes, -v) unlimited
file locks (-x) unlimited
haproxy.cfg
#---------------------------------------------------------------------
# Global settings
#---------------------------------------------------------------------
global
# log 127.0.0.1 local2
chroot /var/lib/haproxy
pidfile /tmp/haproxy.pid
maxconn 8000
# ulimit-n 100000
log 127.0.0.1 local2
daemon
nbproc 2
# nbthread 2
cpu-map 1 0
cpu-map 2 1
stats bind-process 1
# turn on stats unix socket
stats socket /var/lib/haproxy/stats mode 600 level admin
debug
#quiet
ssl-default-bind-ciphers kEECDH+aRSA+AES:kRSA+AES:+AES256:RC4-SHA:!kEDH:!LOW:!EXP:!MD5:!aNULL:!eNULL
# ssl-default-bind-options no-sslv3
#---------------------------------------------------------------------
# common defaults that all the 'listen' and 'backend' sections will
# use if not designated in their block
#---------------------------------------------------------------------
defaults
mode http
log global
option dontlognull
## option forwardfor except 127.0.0.0/8
option dontlog-normal
option clitcpka
option redispatch
option http-server-close
retries 3
timeout http-request 10s
timeout queue 1m
timeout connect 30s
# timeout connect 5m
timeout client 1m
# timeout client 30m
timeout server 1m
maxconn 3000
# timeout server 30m
#timeout http-keep-alive 10s
#timeout check 10s
# HAProxy Stats - http://10.62.232.27:1900
listen haproxy_stats
mode http
bind *:1900
###log-format %ci:%cp\ [%t]\ %ft\ %b/%s\ %Tq/%Tw/%Tc/%Tr/%Tt\ %ST\ %B\ %CC\ %CS\ %tsc\ %ac/%fc/%bc/%sc/%rc\ %sq/%bq\ %hr\ %hs\ %{+Q}r\ %ri
# option httplog
# log global
timeout client 1h
stats enable
stats hide-version
stats realm Haproxy\ Statistics
stats uri /
stats auth haproxy:haproxy
stats refresh 2s
bind-process 1
### incoming frontend
### tcp frontend
frontend tcp-proxy
bind *:9996
mode tcp
# log global
# maxconn 8000
tcp-request inspect-delay 3s
tcp-request content capture req.payload(0,0) len 400000
tcp-request content capture req.len len 400000
tcp-request content accept if WAIT_END
### if match type mgslogging
acl mgslogging req.payload(0,0) -m reg (\"type\":\"MGS-Logging\")
use_backend mgslogging_backend if mgslogging
default_backend dft_backend
### tcp frontend for netuitive
frontend tcp-proxy-netuitive
bind *:9993
mode tcp
# log global
maxconn 100
default_backend netuitive_backend
### riemann metrics frontend
frontend metrics-proxy
bind *:9994
mode tcp
# log global
maxconn 2000
default_backend default_metrics_backend
### metricbeat frontend
#frontend metricbeat
# bind *:9983
# mode tcp
# log global
# maxconn 8000
# timeout client 10800
# default_backend metricsbeat_backend
### decaf frontend
frontend decaf
bind *:9991
mode tcp
# log global
maxconn 300
default_backend decaf_backend
### new bpm frontend
#frontend nifi-bpm
# bind *:9981
# mode tcp
# log global
# maxconn 300
# timeout client 10800
# default_backend nifi_bpm_backend
### iislog
frontend iislog
bind *:9990
mode tcp
# log global
maxconn 3500
option clitcpka
default_backend iislogs_backend
### sqljobs
frontend sqljobs
bind *:9986
mode tcp
# log global
maxconn 2000
option clitcpka
default_backend sqljob_backend
### wineventlog frontend
frontend wineventlog
bind *:9982
mode tcp
# log global
maxconn 6000
option clitcpka
# default_backend wineventlog_backend
default_backend wineventlog_beat_backend
### http frontend
frontend http-proxy
bind *:9983
mode http
# log global
option http-server-close
acl cert_info path_beg /cert_info
acl cluster_info path_beg /cluster_info
acl ambari_info path_beg /ambari_info
use_backend cert_info_http_backend if cert_info
use_backend cluster_info_http_backend if cluster_info
use_backend ambari_info_http_backend if ambari_info
### default backend
### Port: 9996
### In Use: Yes
backend dft_backend
mode tcp
balance roundrobin
option tcpka
# option tcp-check
retries 2
server tng2087 tng2087:9996 check maxconn 1024
### mgslogging_backend
### Port: 9996
### In Use: Yes
backend mgslogging_backend
mode tcp
balance roundrobin
option tcpka
# option tcp-check
retries 2
server tng2087 tng2087:9996 check maxconn 1024
### sqljob_backend
### Port: 9986
### In Use: Yes
backend sqljob_backend
mode tcp
balance roundrobin
option tcpka
option srvtcpka
# option tcp-check
retries 2
server tng2087 tng2087:9986 check maxconn 2048
### cert_info
### Port: 9985
### mode http
### In Use: Yes
backend cert_info_http_backend
mode http
option http-server-close
balance roundrobin
option forwardfor
server tng1286 tng1286.:9985 check maxconn 128
### cluster_info
### Port: 9986
### mode http
### In Use: Yes
backend cluster_info_http_backend
mode http
balance roundrobin
option http-server-close
option forwardfor
server tng1286 tng1286.:9986 check maxconn 256
### ambari_info
### Port: 9987
### mode http
### In Use: Yes
backend ambari_info_http_backend
mode http
option http-server-close
balance roundrobin
option forwardfor
server tng1286 tng1286.:9987 check maxconn 256
### site_json_backend
### Port: 9980
### In Use: No
#backend site_json_backend
# mode tcp
# balance roundrobin
# option tcpka
# retries 2
# server tng2087 10.62.233.69:9980 check maxconn 1024
### wineventlog
### Port: 9982
### In Use: No
#backend wineventlog_backend
# mode tcp
# balance roundrobin
# option tcpka
# option tcp-check
# retries 2
# balance roundrobin
# server tng2087 10.62.233.69:9982 check maxconn 1024
# server tng2089 tng2089.:9982 check maxconn 1024
### wineventlog beat
### Port: 9982
### In Use: Yes
backend wineventlog_beat_backend
mode tcp
balance roundrobin
option tcpka
# option tcp-check
retries 2
balance roundrobin
server tng2089 tng2089.:9982 check maxconn 2048
server tng2088 tng2088.:9982 check maxconn 2048
#backend metricsbeat_backend
# mode tcp
# balance roundrobin
# option tcpka
# option tcp-check
# retries 2
# balance roundrobin
# server tng2089 tng2089.:9983 check maxconn 4096
### ELK Beats related
### Port: 9983
### In Use: No
#backend beats_backend
# mode tcp
# balance roundrobin
# option tcpka
# retries 2
# server tng2087 10.62.233.69:9983 check maxconn 1024
### DECAF
### Port: 9991
### In Use: Yes
backend decaf_backend
mode tcp
balance roundrobin
option tcpka
retries 2
server tng1286 10.62.232.108:9991 check maxconn 1024
### Netuitive
### Port: 9993
### In Use: Yes
backend netuitive_backend
mode tcp
balance roundrobin
# option tcpka
retries 2
server tng2087 10.62.233.80:9993 check maxconn 128
### IISlogs
### Port: 9990
### In Use: Yes
backend iislogs_backend
mode tcp
balance roundrobin
option tcpka
retries 2
server tng1806 10.62.233.38:9990 check maxconn 200
server tng1850 10.62.233.45:9990 check maxconn 200
server tng1851 10.62.233.46:9990 check maxconn 200
### Site IISlog
### Port: 9987
### In Use: No
backend tcp_iislogs_backend
mode tcp
balance roundrobin
option tcpka
retries 2
# server tng2087 10.62.233.69:9987 check maxconn 1024
### syslog
### Port: 9981
### In Use: Yes
backend syslog_backend
mode tcp
balance roundrobin
option tcpka
retries 2
# server tng2087 10.62.233.69:9981 check maxconn 1024
### Site http tcp
### Port: 9984
### In Use: Yes
backend http_backend
mode tcp
balance roundrobin
option tcpka
retries 2
# server tng2087 10.62.233.69:9981 check maxconn 1024
### default metrics backend
### Port: 9994
### In Use: Yes
backend default_metrics_backend
mode tcp
balance roundrobin
option tcpka
retries 2
server tng2089 10.62.233.82:9994 check maxconn 1024
### ELK Production Write Endpoint
listen elasticsearch-production-tcp-9200
mode tcp
bind *:9200
option tcpka
# option tcplog
# option tcp-check
balance roundrobin
fullconn 600
server tng2575 10.62.2.15:9200 check maxconn 512
server tng2576 10.62.2.16:9200 check maxconn 512
server tng2577 10.62.2.17:9200 check maxconn 512
### ELK Production Read Endpoint
listen elasticsearch-production-tcp-9203
mode tcp
bind *:9203
option tcpka
# option tcplog
# option tcp-check
balance roundrobin
server tng2593 10.62.2.7:9200 check maxconn 200
server tng2594 10.62.2.24:9200 check maxconn 200
listen elasticsearch-staging-tcp-9201
mode tcp
bind *:9201
option tcpka
# option tcplog
# option tcp-check
balance roundrobin
server tng1863 10.62.0.235:9200 check maxconn 100
server tng1867 10.62.0.236:9200 check maxconn 100
server tng1868 10.62.0.251:9200 check maxconn 100
Output of `show info` from the stats socket:
Name: HAProxy
Version: 1.9.12
Release_date: 2019/10/24
Nbthread: 1
Nbproc: 1
Process_num: 1
Pid: 11330
Uptime: 0d 2h17m35s
Uptime_sec: 8255
Memmax_MB: 0
PoolAlloc_MB: 1867
PoolUsed_MB: 1847
PoolFailed: 0
Ulimit-n: 16067
Maxsock: 16067
Maxconn: 8000
Hard_maxconn: 8000
CurrConns: 2783
CumConns: 2478322
CumReq: 2478459
MaxSslConns: 0
CurrSslConns: 0
CumSslConns: 0
Maxpipes: 0
PipesUsed: 0
PipesFree: 0
ConnRate: 339
ConnRateLimit: 0
MaxConnRate: 1560
SessRate: 339
SessRateLimit: 0
MaxSessRate: 1560
SslRate: 0
SslRateLimit: 0
MaxSslRate: 0
SslFrontendKeyRate: 0
SslFrontendMaxKeyRate: 0
SslFrontendSessionReuse_pct: 0
SslBackendKeyRate: 0
SslBackendMaxKeyRate: 0
SslCacheLookups: 0
SslCacheMisses: 0
CompressBpsIn: 0
CompressBpsOut: 0
CompressBpsRateLim: 0
ZlibMemUsage: 0
MaxZlibMemUsage: 0
Tasks: 2858
Run_queue: 1
Idle_pct: 95
node: tng2030
Stopping: 0
Jobs: 2797
Unstoppable Jobs: 0
Listeners: 13
ActivePeers: 0
ConnectedPeers: 0
DroppedLogs: 0
BusyPolling: 0
OOM killer messages from the kernel log:
Nov 5 08:54:16 tng2030 kernel: keepalived invoked oom-killer: gfp_mask=0x200da, order=0, oom_score_adj=0
Nov 5 08:54:16 tng2030 kernel: keepalived cpuset=/ mems_allowed=0
Nov 5 08:54:16 tng2030 kernel: CPU: 0 PID: 1128 Comm: keepalived Not tainted 3.10.0-327.13.1.el7.x86_64 #1
Nov 5 08:54:16 tng2030 kernel: Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/05/2016
Nov 5 08:54:16 tng2030 kernel: ffff880427af6780 00000000e4284760 ffff8800361e7a68 ffffffff816356f4
Nov 5 08:54:16 tng2030 kernel: ffff8800361e7af8 ffffffff8163068f ffff880036070440 ffff880036070458
Nov 5 08:54:16 tng2030 kernel: 0000000000000206 ffff880427af6780 ffff8800361e7ae0 ffffffff8112893f
Nov 5 08:54:16 tng2030 kernel: Call Trace:
Nov 5 08:54:16 tng2030 kernel: [<ffffffff816356f4>] dump_stack+0x19/0x1b
Nov 5 08:54:16 tng2030 kernel: [<ffffffff8163068f>] dump_header+0x8e/0x214
Nov 5 08:54:16 tng2030 kernel: [<ffffffff8112893f>] ? delayacct_end+0x8f/0xb0
Nov 5 08:54:16 tng2030 kernel: [<ffffffff8116ce7e>] oom_kill_process+0x24e/0x3b0
Nov 5 08:54:16 tng2030 kernel: [<ffffffff8116c9e6>] ? find_lock_task_mm+0x56/0xc0
Nov 5 08:54:16 tng2030 kernel: [<ffffffff81088d8e>] ? has_capability_noaudit+0x1e/0x30
Nov 5 08:54:16 tng2030 kernel: [<ffffffff8116d6a6>] out_of_memory+0x4b6/0x4f0
Nov 5 08:54:16 tng2030 kernel: [<ffffffff81173885>] __alloc_pages_nodemask+0xa95/0xb90
Nov 5 08:54:16 tng2030 kernel: [<ffffffff811b792a>] alloc_pages_vma+0x9a/0x140
Nov 5 08:54:16 tng2030 kernel: [<ffffffff81194915>] do_wp_page+0xd5/0x800
Nov 5 08:54:16 tng2030 kernel: [<ffffffff8119719c>] handle_mm_fault+0x65c/0xf50
Nov 5 08:54:16 tng2030 kernel: [<ffffffff81641380>] __do_page_fault+0x150/0x450
Nov 5 08:54:16 tng2030 kernel: [<ffffffff816416a3>] do_page_fault+0x23/0x80
Nov 5 08:54:16 tng2030 kernel: [<ffffffff8163d908>] page_fault+0x28/0x30
Nov 5 08:54:16 tng2030 kernel: Mem-Info:
Nov 5 08:54:16 tng2030 kernel: Node 0 DMA per-cpu:
Nov 5 08:54:16 tng2030 kernel: CPU 0: hi: 0, btch: 1 usd: 0
Nov 5 08:54:16 tng2030 kernel: CPU 1: hi: 0, btch: 1 usd: 0
Nov 5 08:54:16 tng2030 kernel: Node 0 DMA32 per-cpu:
Nov 5 08:54:16 tng2030 kernel: CPU 0: hi: 186, btch: 31 usd: 182
Nov 5 08:54:16 tng2030 kernel: CPU 1: hi: 186, btch: 31 usd: 31
Nov 5 08:54:16 tng2030 kernel: Node 0 Normal per-cpu:
Nov 5 08:54:16 tng2030 kernel: CPU 0: hi: 186, btch: 31 usd: 0
Nov 5 08:54:16 tng2030 kernel: CPU 1: hi: 186, btch: 31 usd: 0
Nov 5 08:54:16 tng2030 kernel: active_anon:5330700 inactive_anon:502607 isolated_anon:0#012 active_file:0 inactive_file:250 isolated_file:0#012 unevictable:40 dirty:0 writeback:0 unstable:0#012 free:28858 slab_reclaimable:6581 slab_unreclaimable:9626#012 mapped:15649 shmem:70152 pagetables:205215 bounce:0#012 free_cma:0
Nov 5 08:54:16 tng2030 kernel: Node 0 DMA free:15860kB min:12kB low:12kB high:16kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15992kB managed:15908kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:16kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes
Nov 5 08:54:16 tng2030 kernel: lowmem_reserve[]: 0 2817 24056 24056
Nov 5 08:54:16 tng2030 kernel: Node 0 DMA32 free:87180kB min:2320kB low:2900kB high:3480kB active_anon:2096816kB inactive_anon:527352kB active_file:0kB inactive_file:1000kB unevictable:12kB isolated(anon):0kB isolated(file):0kB present:3129216kB managed:2884844kB mlocked:12kB dirty:0kB writeback:0kB mapped:27088kB shmem:73816kB slab_reclaimable:3016kB slab_unreclaimable:4636kB kernel_stack:624kB pagetables:129544kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:1438 all_unreclaimable? yes
Nov 5 08:54:16 tng2030 kernel: lowmem_reserve[]: 0 0 21239 21239
Nov 5 08:54:16 tng2030 kernel: Node 0 Normal free:12392kB min:17508kB low:21884kB high:26260kB active_anon:19225984kB inactive_anon:1483076kB active_file:0kB inactive_file:0kB unevictable:148kB isolated(anon):0kB isolated(file):0kB present:22020096kB managed:21749064kB mlocked:148kB dirty:0kB writeback:0kB mapped:35508kB shmem:206792kB slab_reclaimable:23308kB slab_unreclaimable:33852kB kernel_stack:5584kB pagetables:691316kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes
Nov 5 08:54:16 tng2030 kernel: lowmem_reserve[]: 0 0 0 0
Nov 5 08:54:16 tng2030 kernel: Node 0 DMA: 1*4kB (U) 0*8kB 1*16kB (U) 1*32kB (U) 1*64kB (U) 1*128kB (U) 1*256kB (U) 0*512kB 1*1024kB (U) 1*2048kB (R) 3*4096kB (M) = 15860kB
Nov 5 08:54:16 tng2030 kernel: Node 0 DMA32: 12*4kB (UEM) 25*8kB (EM) 30*16kB (UEM) 44*32kB (EM) 40*64kB (EM) 24*128kB (EM) 10*256kB (EM) 10*512kB (UEM) 60*1024kB (EM) 5*2048kB (M) 0*4096kB = 87128kB
Nov 5 08:54:16 tng2030 kernel: Node 0 Normal: 1050*4kB (UEM) 324*8kB (UEM) 72*16kB (UEM) 29*32kB (UEM) 27*64kB (UEM) 12*128kB (UEM) 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 12136kB
Nov 5 08:54:16 tng2030 kernel: Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
Nov 5 08:54:16 tng2030 kernel: 179994 total pagecache pages
Nov 5 08:54:16 tng2030 kernel: 109581 pages in swap cache
Nov 5 08:54:16 tng2030 kernel: Swap cache stats: add 14172428, delete 14062847, find 8923856/8941358
Nov 5 08:54:16 tng2030 kernel: Free swap = 0kB
Nov 5 08:54:16 tng2030 kernel: Total swap = 2097148kB
Nov 5 08:54:16 tng2030 kernel: 6291326 pages RAM
Nov 5 08:54:16 tng2030 kernel: 0 pages HighMem/MovableOnly
Nov 5 08:54:16 tng2030 kernel: 128872 pages reserved
Nov 5 08:54:16 tng2030 kernel: Out of memory: Kill process 31022 (haproxy) score 904 or sacrifice child
Nov 5 08:54:16 tng2030 kernel: Killed process 31022 (haproxy) total-vm:417401584kB, anon-rss:22588176kB, file-rss:0kB