Hello !
I’ve been trying to fix an issue for the past 20 days, and it is driving me crazy.
Here is my configuration :
I have two HAProxy 2.0.29 instances running on Debian 9 (stretch).
Behind them are backend servers running Nginx 1.18.0 with PHP 8.1-FPM on Debian 11 (bullseye).
We randomly get 502 Bad Gateway errors with the “SH” termination state, in waves (sometimes more than 26K hits). Our websites are then down until the page is refreshed…
This has a huge impact for us.
We have tried a lot of fixes, but none has worked so far.
Here is our configuration from haproxy.cfg (I anonymized it):
global
    # --- Process identity and isolation ---
    daemon
    user haproxy
    group haproxy
    chroot /var/lib/haproxy
    node <NODE_NAME>

    # --- Logging: two syslog targets, the second one limited to "notice" and above ---
    log /dev/log local0
    log /dev/log local1 notice

    # --- Capacity and tuning ---
    maxconn 50000
    maxcompcpuusage 80
    # Buffer size was raised while investigating the 502 errors.
    tune.bufsize 32768

    # --- Runtime API / stats socket ---
    # NOTE(review): this socket has "level admin" and also listens on a public
    # address; confirm that the address is firewalled or restrict it to the
    # UNIX socket only.
    stats socket /var/run/haproxy.sock,<PUBLIC_IP:PORT> mode 660 level admin expose-fd listeners
    stats timeout 30s
    stats maxconn 3

    # --- TLS material and defaults ---
    ca-base /etc/ssl/certs
    crt-base /etc/ssl/private
    # Cipher list for SSL-enabled listening sockets, taken from:
    # https://ssl-config.mozilla.org/#server=haproxy&server-version=2.0.3&config=intermediate
    ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES128-SHA256:ECDHE-RSA-AES128-SHA256:ECDHE-ECDSA-AES128-SHA:ECDHE-RSA-AES256-SHA384:ECDHE-RSA-AES128-SHA:ECDHE-ECDSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA:ECDHE-RSA-AES256-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA256:AES256-SHA256:AES128-SHA:AES256-SHA
    # TLS 1.2 minimum and no session tickets, on both the client and the server side.
    ssl-default-bind-options ssl-min-ver TLSv1.2 no-tls-tickets
    ssl-default-server-options ssl-min-ver TLSv1.2 no-tls-tickets
    ssl-mode-async
defaults
    # Settings inherited by every frontend/backend/listen section below.
    log global
    retries 3
    maxconn 25000
    mode http
    balance roundrobin
    option httplog
    option dontlognull
    # Keep-alive towards the client, connection closed on the server side
    # after each response.
    option http-server-close
    #timeout http-keep-alive 10s
    option splice-auto

    timeout connect 5s
    timeout client 20m
    timeout server 20m
    timeout queue 3s

    # DoS protection - Slowloris
    # This is the maximum time allowed to receive a complete HTTP request.
    # The previous value (15m) let a slow client hold a connection for a
    # quarter of an hour, which defeats the purpose stated above. Because
    # "timeout http-keep-alive" is commented out, this value also bounds how
    # long an idle keep-alive connection waits for its next request.
    timeout http-request 10s

    # Custom error pages returned by HAProxy itself.
    errorfile 400 /etc/haproxy/errors/400.http
    errorfile 403 /etc/haproxy/errors/403.http
    errorfile 408 /etc/haproxy/errors/408.http
    errorfile 500 /etc/haproxy/errors/500.http
    errorfile 502 /etc/haproxy/errors/502.http
    errorfile 503 /etc/haproxy/errors/503.http
    errorfile 504 /etc/haproxy/errors/504.http
peers proxy
    # The two load balancers; stick-tables declared with "peers proxy" are
    # associated with this section.
    peer <PROXY1_FQDN> <PROXY1_PUBLIC_IP:PORT>
    peer <PROXY2_FQDN> <PROXY2_PUBLIC_IP:PORT>
frontend web
    # Plain-HTTP entry point.
    bind <PUBLIC_IP WITH PORT>
    log /dev/log local0 debug

    # Per-source-IP counters (DoS protection) which also serve as the
    # persistence table referenced by "stick on src table web" in the backends.
    stick-table type ip size 100m expire 15m peers proxy store conn_cur,conn_rate(3s),http_req_rate(10s),http_err_rate(10s)

    # Connection-level rules, evaluated in order: internal addresses are
    # accepted before tracking; everything else is tracked, then rejected
    # when one of the thresholds is exceeded.
    tcp-request connection accept if .... <ALL OUR INTERNAL IPS>
    tcp-request connection track-sc1 src
    tcp-request connection reject if { src_conn_cur gt 500 }
    tcp-request connection reject if { src_conn_rate gt 500 }
    tcp-request connection reject if { src_http_req_rate gt 1000 }
    tcp-request connection reject if { src_http_err_rate gt 100 }

    # Remove client-supplied forwarding headers before "option forwardfor"
    # adds its own X-Forwarded-For.
    http-request del-header X-Forwarded-For
    http-request del-header X-Host
    option forwardfor

    # Routing based on socket.io protocol header
    acl is_socketio path_beg /socket.io/
    acl hdr_connection_upgrade hdr(Connection) -i upgrade
    acl hdr_upgrade_websocket hdr(Upgrade) -i websocket
    # socket.io path, OR (Connection: upgrade AND Upgrade: websocket).
    use_backend socketio if is_socketio || hdr_connection_upgrade hdr_upgrade_websocket
    default_backend http
backend http
    # Web servers reached over plain HTTP on port 80.

    # Health check: GET /up.html with an explicit Host header.
    option httpchk GET /up.html HTTP/1.1\r\nHost:\ <SPECIFIC URL>

    # Source-IP persistence using the table declared in frontend "web",
    # skipped when the request carries the server-selection header.
    stick on src table web unless { hdr(<SPECIFIC HEADER>) -m found }

    # Explicit server selection through the header value.
    use-server <ONE OF OUR WEB SERVER> if { hdr(<SPECIFIC HEADER>) -i <THE WEB SERVER> } #1 line like that per web server

    # Settings applied to every "server" line that follows.
    default-server check maxconn 1000
    server <ONE OF OUR WEB SERVER> <ITS_IP>:80 #1 line like that per web server
backend socketio
    # socket.io / WebSocket servers on port 57570.
    balance leastconn

    # Persistence keyed on the "remote" URL parameter (string, up to 8 chars),
    # kept for 5 hours and associated with the "proxy" peers section.
    stick-table type string len 8 size 100m expire 5h peers proxy
    stick on url_param(remote) table socketio

    # NOTE(review): the defaults section enables "option http-server-close",
    # not "option httpclose", so this line presumably does not change the
    # inherited behaviour. Confirm whether "no option http-server-close" was
    # intended.
    no option httpclose

    server <ONE OF OUR WEB SERVER> <ITS_IP>:57570 check maxconn 2000 #1 line like that per web server
backend limits
    # Table-only backend (it declares no servers): holds the per-key HTTP
    # request rate over 60s that frontend "ssl" tracks with
    # "track-sc0 base32+src table limits".
    stick-table type binary len 20 size 100m expire 10m store http_req_rate(60s)
frontend ssl
    # HTTPS entry point (TLS is terminated here).
    log /dev/log local0
    bind <PROXY1_PUBLIC_IPS> ssl crt /<LIST OF CERTS>

    # Dos protection + client IP persistence
    # Counters live in the stick-table of frontend "web", so both frontends
    # share the same per-source-IP state. Rules are evaluated in order.
    # The braces of an anonymous ACL must be separated from its content by
    # spaces; the original line had no space before the closing brace.
    tcp-request connection accept if { src <INTERNAL IPS> }
    tcp-request connection track-sc1 src table web
    tcp-request connection reject if { src_conn_cur(web) gt 500 }
    tcp-request connection reject if { src_conn_rate(web) gt 500 }
    tcp-request connection reject if { src_http_req_rate(web) gt 1000 }
    tcp-request connection reject if { src_http_err_rate(web) gt 100 }

    # Per-path rate limiting:
    #  - track the request under the key base32+src in table "limits"
    #  - look up the limit for this path in rates.map (longest prefix match)
    #  - read the current request rate for the same key
    #  - deny with 429 when (limit - rate) is negative
    http-request track-sc0 base32+src table limits
    http-request set-var(req.rate_limit) path,map_beg(/etc/haproxy/rates.map)
    http-request set-var(req.request_rate) base32+src,table_http_req_rate(limits)
    acl rate_abuse var(req.rate_limit),sub(req.request_rate) lt 0
    http-request deny deny_status 429 if rate_abuse

    # Routing based on socket.io protocol header
    acl is_socketio path_beg /socket.io/
    acl hdr_connection_upgrade hdr(Connection) -i upgrade
    acl hdr_upgrade_websocket hdr(Upgrade) -i websocket
    # socket.io path, OR (Connection: upgrade AND Upgrade: websocket).
    use_backend socketio if is_socketio || hdr_connection_upgrade hdr_upgrade_websocket
    default_backend ssl
backend ssl
    # Web servers reached over TLS on port 443.
    #stick-table type ip size 100m expire 15m peers proxy store conn_cur,conn_rate(3s),http_req_rate(10s),http_err_rate(10s)

    # Source-IP persistence using the table declared in frontend "web",
    # skipped when the request carries the server-selection header.
    stick on src table web unless { hdr(<SPECIFIC HEADER>) -m found }

    # Remove client-supplied forwarding headers before "option forwardfor"
    # adds its own X-Forwarded-For.
    http-request del-header X-Forwarded-For
    http-request del-header X-Host
    option forwardfor

    # Health check: GET /up.html with an explicit Host header.
    option httpchk GET /up.html HTTP/1.1\r\nHost:\ <SPECIFIC URL>

    # HSTS
    http-response set-header Strict-Transport-Security "max-age=31536000; includeSubDomains"

    # Explicit server selection through the header value; force-persist keeps
    # that selection even when the server is marked down.
    use-server <ONE OF OUR WEB SERVER> if { hdr(<SPECIFIC HEADER>) -i <THE WEB SERVER> }
    force-persist if { hdr(<SPECIFIC HEADER>) -m found }

    # Settings applied to every "server" line that follows: TLS without
    # certificate verification, health checks over TLS negotiating HTTP/1.1,
    # regular traffic negotiating HTTP/2.
    # NOTE(review): "alpn h2" makes HAProxy speak HTTP/2 to nginx, and the
    # malformed requests seen in the nginx access log during the 502 waves
    # are logged as HTTP/2.0. Testing with "alpn http/1.1" would show whether
    # the server-side HTTP/2 path is involved -- unconfirmed.
    default-server maxconn 5000 ssl verify none check check-ssl check-alpn http/1.1 alpn h2

    # The "server" keyword was duplicated here, unlike the server lines of
    # the "http" and "socketio" backends; the syntax is "server <name> <address>".
    server <ONE OF OUR WEB SERVER> <ITS_IP>:443
listen stats
    # Statistics page served over TLS, HTTP/2 or HTTP/1.1, at the root URI.
    bind <PROXY1_PUBLIC IP> ssl crt /<PROXY1_CERT> alpn h2,http/1.1
    stats enable
    stats uri /
    stats realm Haproxy\ Statistics
    stats auth <AUTH_CREDENTIALS>
    stats show-node
    stats show-legends
    # NOTE(review): admin actions are enabled unconditionally for any
    # authenticated user on a publicly bound address -- confirm this is intended.
    stats admin if TRUE
frontend prometheus_metrics
    # Local-only metrics endpoint: requests for /metrics are answered by the
    # built-in Prometheus exporter service.
    bind 127.0.0.1:8404
    option http-use-htx
    http-request use-service prometheus-exporter if { path /metrics }
So, at random, we get 502 Bad Gateway errors with “SH--” as the termination state.
I know that this kind of error means that the server aborted before sending its full HTTP response headers, or that it crashed while processing the request, but we see no crash on either our nginx servers or our haproxy servers. They can communicate with each other (we have in-depth monitoring on each side).
I also noticed on the nginx side that, during these waves of 502 errors, our proxy sends a strange POST request like this one (from our nginx logs):
<PROXY1_PUBLIC IP> - <PROXY1_PUBLIC IP> - - [09/Nov/2022:09:26:11 +0100] "POST https://<WEBSERVER_FQDN>- HTTP/2.0" 000 0 "-" "-"
As you can see the POST request ends with a hyphen.
And we get a great many requests like this one (exactly this one).
Our applications don’t send this kind of request, and as you can see it comes from our proxy.
Does anyone know why we have this behavior ?
We tried increasing the bufsize parameter, removing “option http-server-close”, and many other things, but nothing has actually worked. We can’t reproduce it on demand, it’s not regular, and we don’t run any batch job or script just before these 502 errors appear.
Many thanks for your time !