Randomly getting 502 errors

Hello!
I’m trying to fix an issue I’ve had for the last 20 days, and it’s driving me crazy :sweat_smile:

Here is my configuration:
I have two HAProxy instances, version 2.0.29, on Debian 9 (stretch).
Behind them are backends running nginx 1.18.0 on Debian 11 (bullseye) with PHP 8.1-FPM.
We randomly get 502 Bad Gateway responses with the “SH” termination state, in waves (sometimes more than 26K hits), so our websites appear down until a refresh…
This has a huge impact on us.
We have tried a lot of fixes, but none has worked yet.
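
(For anyone wanting to quantify similar waves: something like the following against the HAProxy log shows when the SH terminations spike. The log path and syslog field layout are assumptions here, adjust them to your rsyslog setup.)

        # count 502s with the SH termination state, grouped per minute
        grep ' SH-- ' /var/log/haproxy.log \
          | awk '{print $1, $2, substr($3, 1, 5)}' \
          | sort | uniq -c | sort -rn | head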

Here is our configuration from haproxy.cfg (I anonymized it):

global
        log /dev/log    local0
        log /dev/log local1 notice
        chroot /var/lib/haproxy
        user haproxy
        group haproxy
        daemon
        tune.bufsize 32768
        # Stats socket
        stats socket /var/run/haproxy.sock,<PUBLIC_IP:PORT> mode 660 level admin expose-fd listeners
        stats timeout 30s
        stats maxconn 3
        node <NODE_NAME>

        # Tuning configuration
        maxconn 50000
        maxcompcpuusage 80

        # Default SSL material locations
        ca-base /etc/ssl/certs
        crt-base /etc/ssl/private

        # Default ciphers to use on SSL-enabled listening sockets.
        # See: https://ssl-config.mozilla.org/#server=haproxy&server-version=2.0.3&config=intermediate
        ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES128-SHA256:ECDHE-RSA-AES128-SHA256:ECDHE-ECDSA-AES128-SHA:ECDHE-RSA-AES256-SHA384:ECDHE-RSA-AES128-SHA:ECDHE-ECDSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA:ECDHE-RSA-AES256-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA256:AES256-SHA256:AES128-SHA:AES256-SHA
        ssl-default-bind-options ssl-min-ver TLSv1.2 no-tls-tickets
        ssl-default-server-options ssl-min-ver TLSv1.2 no-tls-tickets
        ssl-mode-async

defaults
        log     global
        retries 3
        maxconn 25000
        mode    http
        balance roundrobin
        option  httplog
        option  dontlognull
        option http-server-close
        #timeout http-keep-alive 10s
        option  splice-auto
        timeout connect 5s
        timeout client  20m
        timeout server  20m
        timeout queue   3s
        # DoS protection - Slowloris
        timeout http-request 15m
        errorfile 400 /etc/haproxy/errors/400.http
        errorfile 403 /etc/haproxy/errors/403.http
        errorfile 408 /etc/haproxy/errors/408.http
        errorfile 500 /etc/haproxy/errors/500.http
        errorfile 502 /etc/haproxy/errors/502.http
        errorfile 503 /etc/haproxy/errors/503.http
        errorfile 504 /etc/haproxy/errors/504.http

peers proxy
        peer <PROXY1_FQDN> <PROXY1_PUBLIC_IP:PORT>
        peer <PROXY2_FQDN> <PROXY2_PUBLIC_IP:PORT>
frontend web
        log /dev/log  local0 debug
        bind <PUBLIC_IP WITH PORT>
        # Dos protection + client IP persistence
        stick-table type ip size 100m expire 15m peers proxy store conn_cur,conn_rate(3s),http_req_rate(10s),http_err_rate(10s)
        tcp-request connection accept if .... <ALL OUR INTERNAL IPS>
        tcp-request connection track-sc1 src
        tcp-request connection reject if { src_conn_cur gt 500 }
        tcp-request connection reject if { src_conn_rate gt 500 }
        tcp-request connection reject if { src_http_req_rate gt 1000 }
        tcp-request connection reject if { src_http_err_rate gt 100 }

        http-request del-header X-Forwarded-For
        http-request del-header X-Host
        option forwardfor

        # Routing based on socket.io protocol header
        acl is_socketio path_beg /socket.io/
        acl hdr_connection_upgrade hdr(Connection)  -i upgrade
        acl hdr_upgrade_websocket  hdr(Upgrade)     -i websocket

        use_backend socketio if is_socketio || hdr_connection_upgrade hdr_upgrade_websocket
        default_backend http

backend http
        stick on src table web unless { hdr(<SPECIFIC HEADER>) -m found }

        option httpchk GET /up.html HTTP/1.1\r\nHost:\ <SPECIFIC URL>

        use-server <ONE OF OUR WEB SERVER> if { hdr(<SPECIFIC HEADER>) -i <THE WEB SERVER> } #1 line like that per web server

        default-server check maxconn 1000
        server <ONE OF OUR WEB SERVER> <ITS_IP>:80 #1 line like that per web server

backend socketio
        stick-table type string len 8 size 100m expire 5h peers proxy
        stick on url_param(remote) table socketio
        
        balance leastconn
        no option httpclose

        server <ONE OF OUR WEB SERVER> <ITS_IP>:57570 check maxconn 2000  #1 line like that per web server

backend limits
        stick-table  type binary  len 20  size 100m  expire 10m  store http_req_rate(60s)

frontend ssl
        log /dev/log  local0 
        bind <PROXY1_PUBLIC_IPS> ssl crt /<LIST OF CERTS>

        # Dos protection + client IP persistence
        tcp-request connection accept if { src < INTERNAL IPS>} 
        tcp-request connection track-sc1 src table web
        tcp-request connection reject if { src_conn_cur(web) gt 500 }
        tcp-request connection reject if { src_conn_rate(web) gt 500 }
        tcp-request connection reject if { src_http_req_rate(web) gt 1000 }
        tcp-request connection reject if { src_http_err_rate(web) gt 100 }

        http-request track-sc0 base32+src table limits
        http-request set-var(req.rate_limit)  path,map_beg(/etc/haproxy/rates.map)
        http-request set-var(req.request_rate)  base32+src,table_http_req_rate(limits)
        acl rate_abuse var(req.rate_limit),sub(req.request_rate) lt 0
        http-request deny deny_status 429 if rate_abuse

        # Routing based on socket.io protocol header
        acl is_socketio path_beg /socket.io/
        acl hdr_connection_upgrade hdr(Connection)  -i upgrade
        acl hdr_upgrade_websocket  hdr(Upgrade)     -i websocket

        use_backend socketio if is_socketio || hdr_connection_upgrade hdr_upgrade_websocket
        default_backend ssl

backend ssl
        #stick-table type ip size 100m expire 15m peers proxy store conn_cur,conn_rate(3s),http_req_rate(10s),http_err_rate(10s)
        stick on src table web unless { hdr(<SPECIFIC HEADER>) -m found }
        
        http-request del-header X-Forwarded-For
        http-request del-header X-Host
        option forwardfor
        option httpchk GET /up.html HTTP/1.1\r\nHost:\ <SPECIFIC URL>
        # HSTS
        http-response set-header Strict-Transport-Security "max-age=31536000; includeSubDomains"

        use-server <ONE OF OUR WEB SERVER> if { hdr(<SPECIFIC HEADER>) -i <THE WEB SERVER> }
        force-persist if { hdr(<SPECIFIC HEADER>) -m found }

        default-server maxconn 5000 ssl verify none check check-ssl check-alpn http/1.1 alpn h2
        server <ONE OF OUR WEB SERVER> <ITS_IP>:443

listen stats
        bind <PROXY1_PUBLIC IP> ssl crt /<PROXY1_CERT> alpn h2,http/1.1
        stats enable
        stats admin if TRUE
        stats realm Haproxy\ Statistics
        stats uri /
        stats auth <AUTH_CREDENTIALS>
        stats show-node
        stats show-legends

frontend prometheus_metrics
   bind 127.0.0.1:8404
   option http-use-htx
   http-request use-service prometheus-exporter if { path /metrics }
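
A quick note on the rate-limiting block in the ssl frontend, for readers: /etc/haproxy/rates.map simply maps a path prefix to the maximum http_req_rate (over 60s) we allow for that prefix. A hypothetical example of such a map (these paths and numbers are made up, not our real limits):

        # path prefix    max requests per 60s
        /api/            600
        /login           30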

And so, randomly, we get 502 Bad Gateway with “SH--” as the termination state.
I know that this termination state means the server aborted before sending its full HTTP response headers, or crashed while processing the request, but we see no crashes on our nginx servers or on the HAProxy servers, and they can reach each other without problem (we have deep monitoring on each side).
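
(One thing that can help here is asking HAProxy itself what it captured: the admin socket from the global section can dump the last invalid request/response it saw per proxy. A minimal sketch, assuming socat is installed and the socket path from the config above; it may show nothing if the server simply closed the connection without a protocol error:

        # dump the last captured protocol errors from the running HAProxy
        echo "show errors" | socat stdio unix-connect:/var/run/haproxy.sock
)
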
I also noticed on the nginx side that during these 502 error waves, our proxies send a strange POST request like this one (from our nginx logs):

<PROXY1_PUBLIC IP> - <PROXY1_PUBLIC IP> - - [09/Nov/2022:09:26:11 +0100] "POST https://<WEBSERVER_FQDN>- HTTP/2.0" 000 0 "-" "-"

As you can see, the request target of the POST ends with a hyphen.
And we get a lot of requests exactly like this one.

Our applications never send this kind of request, and as you can see it comes from our proxy.
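
(Since the ssl backend negotiates HTTP/2 towards nginx — alpn h2 on the default-server line — one hypothetical way to compare against that truncated request is to send a well-formed HTTP/2 POST directly to one backend and watch its access log. The path and placeholders below are only examples, not our real values:

        # manual HTTP/2 POST straight to a backend, bypassing HAProxy
        curl -vk --http2 -d '' "https://<ITS_IP>/up.html" -H "Host: <WEBSERVER_FQDN>"
)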

Does anyone know why we see this behavior?
We tried increasing the bufsize parameter, removing http-server-close, and much more, but nothing has worked. We can’t reproduce it on demand, it isn’t regular, and we don’t run any batch or script right before these 502 errors appear.
Many thanks for your time!

Update: it turned out we had an issue with HTTP/2 headers; after updating our nginx servers to the latest available version, we no longer get these 502 errors.
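
For anyone landing here with the same symptoms, the fix on our side was simply bringing nginx on the backends up to date. Roughly what that looked like, assuming the nginx.org stable repository is already configured in apt (adjust package names to your setup):

        # check the running version (1.18.0 is the stock Debian 11 package)
        nginx -v
        # pull the current stable package
        sudo apt update && sudo apt install -y nginx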