Issues with Cloudflare -> HAProxy -> Varnish setup when using http-reuse always/safe

Hi, our setup consists of Cloudflare, HAProxy and Varnish.

My issue is with http-reuse always/safe: with either of those modes, some requests end up routed to the wrong place in Varnish.
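
For reference, this is the directive in question; a minimal sketch of the three modes on a placeholder backend (the backend name is made up for illustration, the real sections are in the full config below):

backend varnish_cache     # placeholder name, for illustration only
    # http-reuse always   # any request may be dispatched over any idle server connection; this triggered the misrouting
    # http-reuse safe     # the first request of a session gets its own connection, later ones may reuse idle ones; same problem
    http-reuse never      # idle server connections stay private to the client session; the workaround I'm running now
    server cp37 10.0.17.138:81 check
    server cp38 10.0.19.146:81 check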

Here’s my haproxy config:

global

    cpu-map 1/1- 0 1 2 3 4 5 6 7


    ssl-default-bind-curves X25519:prime256v1:secp384r1
    ssl-default-bind-options prefer-client-ciphers ssl-min-ver TLSv1.3
    ssl-default-bind-ciphersuites TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256
    tune.ssl.cachesize 512000
    tune.ssl.lifetime 86400
    maxconn 200000

    tune.h2.header-table-size 4096
    tune.h2.initial-window-size 65535
    tune.h2.max-concurrent-streams 100

# Pseudo-backends used only for statistics tracking.
backend httpreqrate
    stick-table type ipv6  size 1m expire 300s store http_req_rate(10s),gpc_rate(10,300s)
backend httpreqrate_http
    stick-table type ipv6  size 1m expire 300s store http_req_rate(10s),gpc_rate(10,300s)

listen tls
    log global
    maxconn 199000
    bind :443 tfo ssl crt-list /etc/haproxy/crt-list.cfg tls-ticket-keys /run/haproxy-secrets/stek.keys

    bind :::443 tfo v6only ssl crt-list /etc/haproxy/crt-list.cfg tls-ticket-keys /run/haproxy-secrets/stek.keys

    http-reuse never

    # time to wait for a complete HTTP request; it only applies to the header part of the request (unless option http-buffer-request is used)
    timeout http-request 3600s
    # set the maximum allowed time to wait for a new HTTP request to appear
    timeout http-keep-alive 120s
    # set the maximum inactivity time on the client side
    timeout client 120s
    # inactivity timeout on the client side for half-closed connections
    timeout client-fin 120s
    # connect timeout against a backend server
    timeout connect 3s
    # set the maximum inactivity time on the server side
    timeout server 180s
    # timeout used after upgrading a connection (websockets) or after the first response when no keepalive/close option is specified
    timeout tunnel 3600s

    unique-id-format "%rt"

    capture response header X-Cache-Status len 10
    capture request header Host len 255
    capture request header Referer len 1024
    capture request header User-Agent len 1024
    capture request header Accept-Language len 1024
    capture request header Range len 10
    capture request header Accept len 64
    capture response header Content-Type len 128
    capture response header X-Cache len 96
    capture response header Server len 64

    http-request del-header X-Real-IP

    acl is_cloudflare_ip src 173.245.48.0/20 103.21.244.0/22 103.22.200.0/22 103.31.4.0/22 141.101.64.0/18 108.162.192.0/18 190.93.240.0/20 188.114.96.0/20 197.234.240.0/22 198.41.128.0/17 162.158.0.0/15 104.16.0.0/13 104.24.0.0/14 172.64.0.0/13 131.0.72.0/22 2400:cb00::/32 2606:4700::/32 2803:f800::/32 2405:b500::/32 2405:8100::/32 2a06:98c0::/29 2c0f:f248::/32
    acl cf_ip_hdr req.hdr(CF-Connecting-IP) -m found
    http-request set-header X-Real-IP %[req.hdr(CF-Connecting-IP)] if is_cloudflare_ip cf_ip_hdr
    http-request set-header X-Real-IP %[src] if !is_cloudflare_ip

    http-request set-src hdr(CF-Connecting-IP) if is_cloudflare_ip cf_ip_hdr

    # Redirect unwanted User-Agent requests to 403
    acl is_mj12bot hdr_sub(User-Agent) -i MJ12bot
    acl is_zimit hdr_sub(User-Agent) -i +Zimit
    acl is_sqlmap hdr_sub(User-Agent) -i sqlmap
    acl is_go_http hdr_sub(User-Agent) -i Go-http-client
    acl is_anthropic hdr_sub(User-Agent) -i claudebot@anthropic.com
    acl is_amazonbot hdr_sub(User-Agent) -i Amazonbot

    http-request deny if is_mj12bot
    http-request deny if is_zimit
    http-request deny if is_sqlmap
    http-request deny if is_go_http
    http-request deny if is_anthropic
    http-request deny if is_amazonbot

    http-request set-var(txn.xwd_count) req.hdr_cnt(X-Wikitide-Debug)



    acl wikitide_trust src 2602:294:0:c8::/64 2602:294:0:b13::/64 2602:294:0:b23::/64 2602:294:0:b12::/64 2602:294:0:b33::/64 2602:294:0:b39::/64 2604:2dc0:202:300::7c6 10.0.0.0/8 38.46.223.206 38.46.223.205

    acl too_many_concurrent_queries sc0_trackers(httpreqrate) ge 500
    acl too_much_recent_concurrency sc0_gpc0_rate(httpreqrate) gt 0
    acl mark_as_too_much_concurrency sc0_inc_gpc0(httpreqrate) gt 0
    acl missing_xwd var(txn.xwd_count) -m int eq 0


    http-request del-header X-Request-Id if !wikitide_trust
    http-request del-header tracestate if !wikitide_trust
    http-request del-header traceparent if !wikitide_trust
    http-response del-header Backend-Timing if missing_xwd
    http-response del-header X-OpenStack-Request-ID if missing_xwd
    http-response del-header X-Powered-By if missing_xwd
    http-response del-header X-Request-Id if missing_xwd
    http-response del-header X-Timestamp if missing_xwd
    http-response del-header X-Trans-Id if missing_xwd
    http-response del-header X-Varnish if missing_xwd
    http-response del-header traceparent if missing_xwd
    http-response del-header tracestate if missing_xwd

    http-request track-sc0 src table httpreqrate if !wikitide_trust 
    http-request set-var(req.dummy_silent_drop) src,debug(silent-drop_for_300s,stderr) if !wikitide_trust too_many_concurrent_queries !too_much_recent_concurrency   # exists only for logging side-effect
    http-request silent-drop if too_much_recent_concurrency || !wikitide_trust too_many_concurrent_queries mark_as_too_much_concurrency   # To disable concurrency enforcement, see post_acl_actions in puppet/hieradata/common/role/cache/cache.yaml

    balance uri
    hash-type consistent

    option httpchk
    http-check send meth HEAD uri /check ver HTTP/1.1 hdr Host health.wikitide.net
    http-check expect status 200

    acl hc-host        hdr(host) -i health.wikitide.net
    acl hc-path        path_beg     /check
    use_backend healthcheck if hc-host hc-path

    option forwardfor

    server cp37 10.0.17.138:81 check
    server cp38 10.0.19.146:81 check

backend healthcheck
    option forwardfor
    server hc_server 127.0.0.1:81 maxconn 100

frontend stats
    no log
    maxconn 1000
    bind :9422
    bind :::9422 v6only
    http-request use-service prometheus-exporter if { path /metrics }
    stats enable
    stats uri /stats
    stats refresh 10s
    # Explicitly avoid keep-alive to prevent Prometheus scrapers from
    # reusing the same TCP connection indefinitely. See T343000
    http-after-response set-header Connection Close

listen mw151_backend_tls
    no log
    bind :8113
    bind :::8113 v6only

    http-reuse never

    # time to wait for a complete HTTP request; it only applies to the header part of the request (unless option http-buffer-request is used)
    timeout http-request 3600s
    # set the maximum allowed time to wait for a new HTTP request to appear
    timeout http-keep-alive 120s
    # set the maximum inactivity time on the client side
    timeout client 120s
    # inactivity timeout on the client side for half-closed connections
    timeout client-fin 120s
    # connect timeout against a backend server
    timeout connect 3s
    # set the maximum inactivity time on the server side
    timeout server 180s
    # timeout used after upgrading a connection (websockets) or after the first response when no keepalive/close option is specified
    timeout tunnel 3600s

    # We don't enable health checks here because Varnish handles that.
    server mw151_backend mw151.wikitide.net:443 ssl check-ssl verify none

listen mon181_backend_tls
    no log
    bind :8201
    bind :::8201 v6only

    http-reuse never

    # time to wait for a complete HTTP request; it only applies to the header part of the request (unless option http-buffer-request is used)
    timeout http-request 3600s
    # set the maximum allowed time to wait for a new HTTP request to appear
    timeout http-keep-alive 120s
    # set the maximum inactivity time on the client side
    timeout client 120s
    # inactivity timeout on the client side for half-closed connections
    timeout client-fin 120s
    # connect timeout against a backend server
    timeout connect 3s
    # set the maximum inactivity time on the server side
    timeout server 180s
    # timeout used after upgrading a connection (websockets) or after the first response when no keepalive/close option is specified
    timeout tunnel 3600s

    # We don't enable health checks here because Varnish handles that.
    server mon181_backend mon181.wikitide.net:443 ssl check-ssl verify none

So random wiki requests end up routed to mon181. We load-balance across the Varnish servers by hashing the URI (balance uri with hash-type consistent) to improve the cache hit rate.

e.g. test.miraheze.org → Cloudflare → HAProxy → Varnish → mon181. We also use HAProxy so that Varnish can reach its backends over TLS (the *_backend_tls listeners above).
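
To make the path easier to follow, here is a condensed view of the two HAProxy roles involved, pulled straight from the full config above (nothing new, just the relevant lines side by side):

# Front: terminate client/Cloudflare TLS and spread requests across the Varnish nodes by URI hash
listen tls
    balance uri
    hash-type consistent
    server cp37 10.0.17.138:81 check
    server cp38 10.0.19.146:81 check

# Back: local listeners Varnish talks to so that its backend traffic is re-encrypted with TLS
listen mw151_backend_tls
    bind :8113
    server mw151_backend mw151.wikitide.net:443 ssl check-ssl verify none

listen mon181_backend_tls
    bind :8201
    server mon181_backend mon181.wikitide.net:443 ssl check-ssl verify none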

I'm not sure what I'm doing wrong or how to fix it. (The config above has http-reuse never because switching to that fixed it.)

I'm not sure what HAProxy can do here; it sounds like you need to troubleshoot this in Varnish.

Even if HAProxy were doing something wrong here, you would first have to find out what that is, and you can only find out by troubleshooting why Varnish makes the wrong routing decision in the first place.
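
That said, one cheap thing HAProxy can contribute to that troubleshooting is a correlation ID, so a misrouted request observed on mon181 or in the Varnish logs can be matched back to the exact HAProxy session that carried it. A minimal sketch building on the unique-id-format already present in the config (the X-Unique-ID header name is just a placeholder):

listen tls
    unique-id-format "%rt"           # already present above
    unique-id-header X-Unique-ID     # also send that ID to Varnish as a request header
    # on the HAProxy side the same ID can be logged with %ID in a custom log-format,
    # so the HAProxy and Varnish logs can be grepped for the same value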