HAProxy marks server as down while not being down - inexplicable healthcheck timeouts

Hello,
I’m using HA-Proxy version 2.0.13-1~bpo10+1 2020/02/15 on Debian 10.

We have configured the following setup:

global 
	daemon
	chroot /var/lib/haproxy
	user haproxy
	group haproxy
	master-worker

	stats socket /run/haproxy/admin.sock user haproxy group haproxy mode 660 level admin expose-fd listeners
	stats timeout 10s

	# Default SSL material locations
	ca-base /etc/ssl/certs
	crt-base /etc/ssl/private

	tune.ssl.default-dh-param 2048
	ssl-default-bind-options no-sslv3 no-tlsv10 no-tlsv11 no-tls-tickets
	ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384

	# See: https://ssl-config.mozilla.org/#server=haproxy&server-version=2.0.3&config=intermediate
	# intermediate configuration, tweak to your needs
	ssl-default-bind-ciphersuites TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256
	ssl-default-server-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384
	ssl-default-server-ciphersuites TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256
	ssl-default-server-options no-sslv3 no-tlsv10 no-tlsv11 no-tls-tickets

	ssl-dh-param-file /etc/haproxy/dhparam.pem
	tune.ssl.cachesize 80000
	tune.ssl.lifetime 900

	log /dev/log len 4096 local0
	log /dev/log len 4096 local1 notice

defaults 
	mode http
	option http-server-close #Setting "option http-server-close" enables HTTP connection-close mode on the server side while keeping the ability to support HTTP keep-alive and pipelining on the client side. This provides the lowest latency on the client side (slow network) and the fastest session reuse on the server side to save server resources
	option forwardfor #Enable insertion of the X-Forwarded-For header to requests sent to servers
	timeout http-keep-alive 15s #Set the maximum allowed time to wait for a new HTTP request to appear
	option redispatch #if a server designated by a cookie is down, clients may definitely stick to it because they cannot flush the cookie, so they will not be able to access the service anymore. Specifying "option redispatch" will allow the proxy to break cookie
	option http-buffer-request # enable HTTP body logging, needed for body param extraction
	retries 3 # per default start 3 retries if server isn't reacting

	log global
	log-format '{"host":"%H","ident":"haproxy-v2","pid":%pid,"time":"%Tl","customerId":"%[capture.req.hdr(4)]","haproxy":{"conn":{"actconn":%ac,"feconn":%fc,"beconn":%bc,"srv_conn":%sc},"queue":{"backend":%bq,"srv":%sq},"time":{"Tq":%Tq,"Tw":%Tw,"Tc":%Tc,"Tr":%Tr,"Tt":%Tt,"Ta":%Ta},"termination_state":"%tsc","retries":%rc,"proxying":{"client_ip":"%ci","client_port":%cp,"frontend_ip":"%fi","frontend_port":%fp,"backend_name":"%b","frontend_name":"%ft","server_name":"%s","server_ip":"%si","server_port":"%sp"},"ssl":{"version":"%sslv","ciphers":"%sslc"},"request":{"method":"%HM","uri":"%HU","uri-query-string":"%HQ","uri-path":"%HP","protocol":"%HV"},"req-headers":{"user-agent":"%[capture.req.hdr(0)]","referrer":"%[capture.req.hdr(1)]","content-length":"%[capture.req.hdr(2)]","requested-host":"%[capture.req.hdr(3)]"},"response":{"status_code":%ST},"bytes":{"uploaded":%U,"read":%B}}}'

	no option log-separate-errors # we need the errors not separately as we have it in ELK stack
	option log-health-checks # we want the haproxy checks logged as well
	no option logasap # disabled as we want to see all time it needs to fulfill the request
	no option dontlognull # disabled as we want to see normal requests as well

	timeout connect 20s
	timeout queue 30s
	timeout client 60s
	timeout server 60s
	timeout client-fin 30s
	timeout http-request 20s
	timeout http-keep-alive 75s
	timeout tarpit 15s

	errorfile 400 /etc/haproxy/errors/400.http
	errorfile 403 /etc/haproxy/errors/403.http
	errorfile 408 /etc/haproxy/errors/408.http
	errorfile 500 /etc/haproxy/errors/500.http
	errorfile 502 /etc/haproxy/errors/502.http
	errorfile 503 /etc/haproxy/errors/503.http
	errorfile 504 /etc/haproxy/errors/504.http



### DEFINE FRONTENDS
  # this is our general frontend
frontend http-https-in 
	mode http
	bind <haproxy-server>:443 ssl crt /etc/haproxy/certs/10.2018..com.pem crt /etc/haproxy/certs/.pem crt /etc/haproxy/certs/.com.pem
	bind <haproxy-server>:80




	# CAPTURE DATA FROM REQUESTS FOR LOGGING
	capture request header User-Agent len 256 # for user agent
	capture request header Referer len 256 # for referrer
	capture request header Content-Length len 10 # for content length of POST (does this also work for PUT?)
	capture request header Host len 128 # for the host, subdomain, etc
	http-request capture req.body_param(customerId) len 4

	# HSTS (63072000 seconds)

	# tell the haproxy backends and all services beyond that this is https/http
	http-request add-header X-Forwarded-Proto https if { ssl_fc }
	http-request add-header X-Forwarded-Proto http if !{ ssl_fc }

	# redirect all non-https traffic
	http-response set-header Strict-Transport-Security max-age=63072000
	http-request redirect scheme https if !{ ssl_fc }


# DEFINE ACLS
...

# DEFINE ACL TO BACKEND
....



# DEFINE BACKENDS
backend backend01
	balance roundrobin
	option httpchk GET /healthcheck
	http-check expect rstatus (2|3)[0-9][0-9]

	server server1 <server-1-IP>:8585  check inter 10s fastinter 5s fall 2 rise 1 observe layer7 on-error mark-down error-limit 10 pool-purge-delay 30s
	server server2 <server-2-IP>:8585  check inter 10s fastinter 5s fall 2 rise 1 observe layer7 on-error mark-down error-limit 10 pool-purge-delay 30s backup

The problem we’re facing are DOWNs and UPs for the configured server1 and server2 without them being really down. I see in the haproxy logs entries like:

Mar 19 21:10:08 haproxy-server haproxy[3203]: [WARNING] 078/211008 (3204) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10000ms, status: 0/1 DOWN.
Mar 19 21:10:09 haproxy-server haproxy[3203]: [WARNING] 078/211008 (3204) : Server backend01/server1 is DOWN. 0 active and 1 backup servers left. Running on backup. 1 sessions active, 0 requeued, 0 remaining in queue.
Mar 19 21:10:27 haproxy-server haproxy[3203]: [WARNING] 078/211027 (3204) : Health check for server backend01/server1 succeeded, reason: Layer7 check passed, code: 200, info: "HTTP status check returned code <3C>200<3E>", check duration: 8009ms, status: 2/2 UP.
Mar 19 21:10:27 haproxy-server haproxy[3203]: [WARNING] 078/211027 (3204) : Server backend01/server1 is UP. 1 active and 1 backup servers online. 0 sessions requeued, 0 total in queue.
Mar 19 21:10:47 haproxy-server haproxy[3203]: [WARNING] 078/211047 (3204) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10001ms, status: 1/2 UP.
Mar 19 21:11:02 haproxy-server haproxy[3203]: [WARNING] 078/211102 (3204) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10000ms, status: 0/1 DOWN.
Mar 19 21:11:02 haproxy-server haproxy[3203]: [WARNING] 078/211102 (3204) : Server backend01/server1 is DOWN. 0 active and 1 backup servers left. Running on backup. 0 sessions active, 0 requeued, 0 remaining in queue.
Mar 19 21:12:17 haproxy-server haproxy[3203]: [WARNING] 078/211217 (3204) : Health check for server backend01/server1 succeeded, reason: Layer7 check passed, code: 200, info: "HTTP status check returned code <3C>200<3E>", check duration: 5006ms, status: 2/2 UP.
Mar 19 21:12:17 haproxy-server haproxy[3203]: [WARNING] 078/211217 (3204) : Server backend01/server1 is UP. 1 active and 1 backup servers online. 0 sessions requeued, 0 total in queue.
Mar 19 21:28:17 haproxy-server haproxy[3203]: [WARNING] 078/212817 (3204) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10001ms, status: 1/2 UP.
Mar 19 21:28:32 haproxy-server haproxy[3203]: [WARNING] 078/212832 (3204) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10001ms, status: 0/1 DOWN.
Mar 19 21:28:32 haproxy-server haproxy[3203]: [WARNING] 078/212832 (3204) : Server backend01/server1 is DOWN. 0 active and 1 backup servers left. Running on backup. 1 sessions active, 0 requeued, 0 remaining in queue.
Mar 19 21:28:47 haproxy-server haproxy[3203]: [WARNING] 078/212847 (3204) : Health check for server backend01/server1 succeeded, reason: Layer7 check passed, code: 200, info: "HTTP status check returned code <3C>200<3E>", check duration: 5006ms, status: 2/2 UP.
Mar 19 21:28:47 haproxy-server haproxy[3203]: [WARNING] 078/212847 (3204) : Server backend01/server1 is UP. 1 active and 1 backup servers online. 0 sessions requeued, 0 total in queue.
Mar 19 21:36:07 haproxy-server haproxy[3203]: [WARNING] 078/213607 (3204) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10001ms, status: 1/2 UP.
Mar 19 21:36:12 haproxy-server haproxy[3203]: [WARNING] 078/213612 (3204) : Health check for server backend01/server1 succeeded, reason: Layer7 check passed, code: 200, info: "HTTP status check returned code <3C>200<3E>", check duration: 1ms, status: 2/2 UP.
Mar 19 21:40:27 haproxy-server haproxy[3203]: [WARNING] 078/214027 (3204) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10000ms, status: 1/2 UP.
Mar 19 21:40:42 haproxy-server haproxy[3203]: [WARNING] 078/214042 (3204) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10001ms, status: 0/1 DOWN.
Mar 19 21:40:42 haproxy-server haproxy[3203]: [WARNING] 078/214042 (3204) : Server backend01/server1 is DOWN. 0 active and 1 backup servers left. Running on backup. 0 sessions active, 0 requeued, 0 remaining in queue.
Mar 19 21:43:20 haproxy-server haproxy[3203]: [WARNING] 078/214320 (3204) : Health check for server backend01/server1 succeeded, reason: Layer7 check passed, code: 200, info: "HTTP status check returned code <3C>200<3E>", check duration: 8006ms, status: 2/2 UP.
Mar 19 21:43:20 haproxy-server haproxy[3203]: [WARNING] 078/214320 (3204) : Server backend01/server1 is UP. 1 active and 1 backup servers online. 0 sessions requeued, 0 total in queue.
Mar 19 21:56:06 haproxy-server haproxy[3203]: [WARNING] 078/215606 (3204) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10001ms, status: 1/2 UP.
Mar 19 21:56:21 haproxy-server haproxy[3203]: [WARNING] 078/215621 (3204) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10001ms, status: 0/1 DOWN.
Mar 19 21:56:21 haproxy-server haproxy[3203]: [WARNING] 078/215621 (3204) : Server backend01/server1 is DOWN. 0 active and 1 backup servers left. Running on backup. 0 sessions active, 0 requeued, 0 remaining in queue.
Mar 19 21:58:11 haproxy-server haproxy[3203]: [WARNING] 078/215811 (3204) : Health check for server backend01/server1 succeeded, reason: Layer7 check passed, code: 200, info: "HTTP status check returned code <3C>200<3E>", check duration: 1ms, status: 2/2 UP.
Mar 19 21:58:11 haproxy-server haproxy[3203]: [WARNING] 078/215811 (3204) : Server backend01/server1 is UP. 1 active and 1 backup servers online. 0 sessions requeued, 0 total in queue.
Mar 19 22:10:09 haproxy-server haproxy[3203]: [WARNING] 078/221009 (3204) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10000ms, status: 1/2 UP.
Mar 19 22:10:24 haproxy-server haproxy[3203]: [WARNING] 078/221024 (3204) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10001ms, status: 0/1 DOWN.
Mar 19 22:10:24 haproxy-server haproxy[3203]: [WARNING] 078/221024 (3204) : Server backend01/server1 is DOWN. 0 active and 1 backup servers left. Running on backup. 0 sessions active, 0 requeued, 0 remaining in queue.
Mar 19 22:13:39 haproxy-server haproxy[3203]: [WARNING] 078/221339 (3204) : Health check for server backend01/server1 succeeded, reason: Layer7 check passed, code: 200, info: "HTTP status check returned code <3C>200<3E>", check duration: 5006ms, status: 2/2 UP.
Mar 19 22:13:39 haproxy-server haproxy[3203]: [WARNING] 078/221339 (3204) : Server backend01/server1 is UP. 1 active and 1 backup servers online. 0 sessions requeued, 0 total in queue.

But I also ran a separate script on the same machine hammering the server1 API with 5 requests/s for more than 2h and none of the ~31k requests was getting something other back than HTTP status code 200.

Did anyone experience something similar?

First thing I’d try is to completely remove observe layer7 on-error mark-down error-limit 10, to make sure that health check is not impacted by actual requests going through (maybe a client triggers a 500, or 506+ response).

If that is not the issue, then I don’t see any other way than to capture the traffic and analyze it.

Ok, I disabled the observe layer7 on-error mark-down error-limit 10 and checked the situation again for a couple of hours. Unfortunately this didn’t change anything.

As you suggested I then dumped the traffic on the haproxy server and waited until the next failing healthcheck. I found the following:

(IP x.x.x.98 is HAProxy server, IP x.x.x.144 is application server)

The corresponding HAProxy healthcheck log looks like:

Mar 20 10:41:25 haproxy-server haproxy[20922]: [WARNING] 079/104125 (20923) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10001ms, status: 1/2 UP.
Mar 20 10:41:40 haproxy-server haproxy[20922]: [WARNING] 079/104140 (20923) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10000ms, status: 0/1 DOWN.
Mar 20 10:41:40 haproxy-server haproxy[20922]: [WARNING] 079/104140 (20923) : Server backend01/server1 is DOWN. 0 active and 1 backup servers left. Running on backup. 0 sessions active, 0 requeued, 0 remaining in queue.
Mar 20 10:42:15 haproxy-server haproxy[20922]: [WARNING] 079/104215 (20923) : Health check for server backend01/server1 succeeded, reason: Layer7 check passed, code: 200, info: "HTTP status check returned code <3C>200<3E>", check duration: 5003ms, status: 2/2 UP.
Mar 20 10:42:15 haproxy-server haproxy[20922]: [WARNING] 079/104215 (20923) : Server backend01/server1 is UP. 1 active and 1 backup servers online. 0 sessions requeued, 0 total in queue.
Mar 20 10:43:35 haproxy-server haproxy[7653]: [WARNING] 079/104335 (7654) : Health check for server backend01/server1 succeeded, reason: Layer7 check passed, code: 200, info: "HTTP status check returned code <3C>200<3E>", check duration: 5003ms, status: 2/2 UP.
Mar 20 10:54:41 haproxy-server haproxy[7653]: [WARNING] 079/105441 (7654) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10001ms, status: 1/2 UP.
Mar 20 10:54:56 haproxy-server haproxy[7653]: [WARNING] 079/105456 (7654) : Health check for server backend01/server1 failed, reason: Layer7 timeout, check duration: 10001ms, status: 0/1 DOWN.
Mar 20 10:54:56 haproxy-server haproxy[7653]: [WARNING] 079/105456 (7654) : Server backend01/server1 is DOWN. 0 active and 1 backup servers left. Running on backup. 0 sessions active, 0 requeued, 0 remaining in queue.
Mar 20 10:55:46 haproxy-server haproxy[7653]: [WARNING] 079/105546 (7654) : Health check for server backend01/server1 succeeded, reason: Layer7 check passed, code: 200, info: "HTTP status check returned code <3C>200<3E>", check duration: 1ms, status: 2/2 UP.
Mar 20 10:55:46 haproxy-server haproxy[7653]: [WARNING] 079/105546 (7654) : Server backend01/server1 is UP. 1 active and 1 backup servers online. 0 sessions requeued, 0 total in queue.

I’m not a good network protocol expert but as far as I can interpret the Wireshark listing is:

  1. On 10:54:31 the haproxy starts sending a SYN
  2. App server answeres with ACK/SYN
  3. haproxy answers ACK
  4. Then haproxy sends the HTTP packet to our defined healthcheck resource at the app server API
  5. The app server send an ACK for receiving the HTTP packet
  6. …then nothing happens!
  7. The haproxy sends finally an ACK and RST packet and is terminating the connection, marking the server as DOWN

Am I right with this interpretation? In this case it must be our application server which isn’t answering correctly.

I agree with your interpretation, and that also matches what haproxy logs, which is a Layer7 timeout, meaning the connection is fine but the response is not.