mirror of https://github.com/F-Stack/f-stack.git
The TCP stack has been changed to use the estimated RTT instead of timestamps for receive buffer auto resizing.
Corresponding upstream changeset from https://svnweb.freebsd.org/base?view=revision&revision=317368.
parent 56d87bf891
commit 04b1440d33
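In brief: the old code armed the auto-resize measurement only when the peer echoed a TCP timestamp (to->to_tsecr) within one hz, while the new tcp_autorcvbuf() gates the step-up on the smoothed RTT estimate (t_srtt), so buffer scaling also works when the timestamp option is not negotiated. A minimal userland sketch of the new gate follows; this is illustrative C, not kernel code. It assumes TCP_RTT_SHIFT of 5 as in FreeBSD's tcp_var.h and elides the TCP_TS_TO_TICKS() unit conversion that the real check performs:

#include <stdint.h>
#include <stdio.h>

#define TCP_RTT_SHIFT   5       /* t_srtt is kept << 5 in the kernel */

/* Has more than one smoothed RTT elapsed since the stamp was armed? */
static int
autorcvbuf_gate(uint32_t now_ticks, uint32_t rfbuf_ts, int t_srtt)
{
        return (t_srtt != 0 && rfbuf_ts != 0 &&
            (now_ticks - rfbuf_ts) > (uint32_t)(t_srtt >> TCP_RTT_SHIFT));
}

int
main(void)
{
        int t_srtt = 100 << TCP_RTT_SHIFT;      /* smoothed RTT of 100 ticks */

        printf("after  50 ticks: %d\n", autorcvbuf_gate(1050, 1000, t_srtt));
        printf("after 150 ticks: %d\n", autorcvbuf_gate(1150, 1000, t_srtt));
        return (0);
}

This prints 0 then 1: less than one RTT elapsed means keep accumulating rfbuf_cnt, more than one RTT means evaluate a step-up, exactly the branch structure of tcp_autorcvbuf() in the diff below.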
@@ -132,6 +132,14 @@ SDT_PROBE_DEFINE6_XLATE(tcp, , , state__change,
     "void *", "void *",
     "int", "tcplsinfo_t *");
 
+SDT_PROBE_DEFINE6_XLATE(tcp, , , receive__autoresize,
+    "void *", "void *",
+    "struct tcpcb *", "csinfo_t *",
+    "struct mbuf *", "ipinfo_t *",
+    "struct tcpcb *", "tcpsinfo_t *" ,
+    "struct tcphdr *", "tcpinfoh_t *",
+    "int", "int");
+
 SDT_PROBE_DEFINE5_XLATE(udp, , , receive,
     "void *", "pktinfo_t *",
     "struct inpcb *", "csinfo_t *",
@@ -65,6 +65,7 @@ SDT_PROBE_DECLARE(tcp, , , debug__input);
 SDT_PROBE_DECLARE(tcp, , , debug__output);
 SDT_PROBE_DECLARE(tcp, , , debug__user);
 SDT_PROBE_DECLARE(tcp, , , debug__drop);
+SDT_PROBE_DECLARE(tcp, , , receive__autoresize);
 
 SDT_PROBE_DECLARE(udp, , , receive);
 SDT_PROBE_DECLARE(udp, , , send);
@@ -1494,6 +1494,68 @@ drop:
     return (IPPROTO_DONE);
 }
 
+/*
+ * Automatic sizing of receive socket buffer.  Often the send
+ * buffer size is not optimally adjusted to the actual network
+ * conditions at hand (delay bandwidth product).  Setting the
+ * buffer size too small limits throughput on links with high
+ * bandwidth and high delay (eg. trans-continental/oceanic links).
+ *
+ * On the receive side the socket buffer memory is only rarely
+ * used to any significant extent.  This allows us to be much
+ * more aggressive in scaling the receive socket buffer.  For
+ * the case that the buffer space is actually used to a large
+ * extent and we run out of kernel memory we can simply drop
+ * the new segments; TCP on the sender will just retransmit it
+ * later.  Setting the buffer size too big may only consume too
+ * much kernel memory if the application doesn't read() from
+ * the socket or packet loss or reordering makes use of the
+ * reassembly queue.
+ *
+ * The criteria to step up the receive buffer one notch are:
+ *  1. Application has not set receive buffer size with
+ *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
+ *  2. the number of bytes received during the time it takes
+ *     one timestamp to be reflected back to us (the RTT);
+ *  3. received bytes per RTT is within seven eighth of the
+ *     current socket buffer size;
+ *  4. receive buffer size has not hit maximal automatic size;
+ *
+ * This algorithm does one step per RTT at most and only if
+ * we receive a bulk stream w/o packet losses or reorderings.
+ * Shrinking the buffer during idle times is not necessary as
+ * it doesn't consume any memory when idle.
+ *
+ * TODO: Only step up if the application is actually serving
+ * the buffer to better manage the socket buffer resources.
+ */
+int
+tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
+    struct tcpcb *tp, int tlen)
+{
+    int newsize = 0;
+
+    if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) &&
+        tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
+        TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) >
+        (tp->t_srtt >> TCP_RTT_SHIFT)) {
+        if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) &&
+            so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
+            newsize = min(so->so_rcv.sb_hiwat +
+                V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max);
+        }
+        TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize);
+
+        /* Start over with next RTT. */
+        tp->rfbuf_ts = 0;
+        tp->rfbuf_cnt = 0;
+    } else {
+        tp->rfbuf_cnt += tlen;  /* add up */
+    }
+
+    return (newsize);
+}
+
 void
 tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
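The thresholds in the function above are worth seeing in motion: the buffer steps up by V_tcp_autorcvbuf_inc once per RTT, but only while more than 7/8 of the current sb_hiwat arrives within one RTT and the V_tcp_autorcvbuf_max ceiling has not been reached. A self-contained sketch of that arithmetic (the starting size, increment, and cap are illustrative values, not any branch's sysctl defaults):

#include <stdio.h>

static unsigned
min_u(unsigned a, unsigned b)
{
        return (a < b ? a : b);
}

int
main(void)
{
        unsigned sb_hiwat = 64 * 1024;          /* current receive buffer */
        unsigned inc = 16 * 1024;               /* per-RTT increment */
        unsigned max = 256 * 1024;              /* automatic-size ceiling */
        unsigned rfbuf_cnt = 1024 * 1024;       /* bytes received this RTT */

        for (int rtt = 1; rtt <= 14; rtt++) {
                /* Same test as tcp_autorcvbuf(): 7/8 full and below the cap. */
                if (rfbuf_cnt > sb_hiwat / 8 * 7 && sb_hiwat < max)
                        sb_hiwat = min_u(sb_hiwat + inc, max);
                printf("RTT %2d: sb_hiwat = %u\n", rtt, sb_hiwat);
        }
        return (0);
}

With these numbers the buffer climbs 16 KB per RTT and pins at the 256 KB cap after twelve steps, which is the "one step per RTT at most" behavior the comment describes.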
@@ -1847,62 +1909,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 #endif
         TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
 
-        /*
-         * Automatic sizing of receive socket buffer.  Often the send
-         * buffer size is not optimally adjusted to the actual network
-         * conditions at hand (delay bandwidth product).  Setting the
-         * buffer size too small limits throughput on links with high
-         * bandwidth and high delay (eg. trans-continental/oceanic links).
-         *
-         * On the receive side the socket buffer memory is only rarely
-         * used to any significant extent.  This allows us to be much
-         * more aggressive in scaling the receive socket buffer.  For
-         * the case that the buffer space is actually used to a large
-         * extent and we run out of kernel memory we can simply drop
-         * the new segments; TCP on the sender will just retransmit it
-         * later.  Setting the buffer size too big may only consume too
-         * much kernel memory if the application doesn't read() from
-         * the socket or packet loss or reordering makes use of the
-         * reassembly queue.
-         *
-         * The criteria to step up the receive buffer one notch are:
-         *  1. Application has not set receive buffer size with
-         *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
-         *  2. the number of bytes received during the time it takes
-         *     one timestamp to be reflected back to us (the RTT);
-         *  3. received bytes per RTT is within seven eighth of the
-         *     current socket buffer size;
-         *  4. receive buffer size has not hit maximal automatic size;
-         *
-         * This algorithm does one step per RTT at most and only if
-         * we receive a bulk stream w/o packet losses or reorderings.
-         * Shrinking the buffer during idle times is not necessary as
-         * it doesn't consume any memory when idle.
-         *
-         * TODO: Only step up if the application is actually serving
-         * the buffer to better manage the socket buffer resources.
-         */
-        if (V_tcp_do_autorcvbuf &&
-            (to.to_flags & TOF_TS) &&
-            to.to_tsecr &&
-            (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
-                if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
-                    to.to_tsecr - tp->rfbuf_ts < hz) {
-                        if (tp->rfbuf_cnt >
-                            (so->so_rcv.sb_hiwat / 8 * 7) &&
-                            so->so_rcv.sb_hiwat <
-                            V_tcp_autorcvbuf_max) {
-                                newsize =
-                                    min(so->so_rcv.sb_hiwat +
-                                    V_tcp_autorcvbuf_inc,
-                                    V_tcp_autorcvbuf_max);
-                        }
-                        /* Start over with next RTT. */
-                        tp->rfbuf_ts = 0;
-                        tp->rfbuf_cnt = 0;
-                } else
-                        tp->rfbuf_cnt += tlen;  /* add up */
-        }
+        newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
 
         /* Add data to socket buffer. */
         SOCKBUF_LOCK(&so->so_rcv);
@@ -1943,10 +1950,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
             win = 0;
         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 
-        /* Reset receive buffer auto scaling when not in bulk receive mode. */
-        tp->rfbuf_ts = 0;
-        tp->rfbuf_cnt = 0;
-
         switch (tp->t_state) {
 
         /*
@@ -798,11 +798,13 @@ send:
             to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
             to.to_tsecr = tp->ts_recent;
             to.to_flags |= TOF_TS;
-            /* Set receive buffer autosizing timestamp. */
-            if (tp->rfbuf_ts == 0 &&
-                (so->so_rcv.sb_flags & SB_AUTOSIZE))
-                tp->rfbuf_ts = tcp_ts_getticks();
         }
 
+        /* Set receive buffer autosizing timestamp. */
+        if (tp->rfbuf_ts == 0 &&
+            (so->so_rcv.sb_flags & SB_AUTOSIZE))
+            tp->rfbuf_ts = tcp_ts_getticks();
+
         /* Selective ACK's. */
         if (tp->t_flags & TF_SACK_PERMIT) {
             if (flags & TH_SYN)
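Note the relocation in the hunk above: arming rfbuf_ts used to happen only inside the timestamp-option branch, so a connection that never negotiated TCP timestamps could not start an auto-resize measurement. With the RTT-based gate, the stamp is now taken whenever SB_AUTOSIZE is set, independent of the TOF_TS path.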
@@ -396,62 +396,8 @@ tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
                 (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
         TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
-        /*
-         * Automatic sizing of receive socket buffer.  Often the send
-         * buffer size is not optimally adjusted to the actual network
-         * conditions at hand (delay bandwidth product).  Setting the
-         * buffer size too small limits throughput on links with high
-         * bandwidth and high delay (eg. trans-continental/oceanic links).
-         *
-         * On the receive side the socket buffer memory is only rarely
-         * used to any significant extent.  This allows us to be much
-         * more aggressive in scaling the receive socket buffer.  For
-         * the case that the buffer space is actually used to a large
-         * extent and we run out of kernel memory we can simply drop
-         * the new segments; TCP on the sender will just retransmit it
-         * later.  Setting the buffer size too big may only consume too
-         * much kernel memory if the application doesn't read() from
-         * the socket or packet loss or reordering makes use of the
-         * reassembly queue.
-         *
-         * The criteria to step up the receive buffer one notch are:
-         *  1. Application has not set receive buffer size with
-         *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
-         *  2. the number of bytes received during the time it takes
-         *     one timestamp to be reflected back to us (the RTT);
-         *  3. received bytes per RTT is within seven eighth of the
-         *     current socket buffer size;
-         *  4. receive buffer size has not hit maximal automatic size;
-         *
-         * This algorithm does one step per RTT at most and only if
-         * we receive a bulk stream w/o packet losses or reorderings.
-         * Shrinking the buffer during idle times is not necessary as
-         * it doesn't consume any memory when idle.
-         *
-         * TODO: Only step up if the application is actually serving
-         * the buffer to better manage the socket buffer resources.
-         */
-        if (V_tcp_do_autorcvbuf &&
-            (to->to_flags & TOF_TS) &&
-            to->to_tsecr &&
-            (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
-                if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) &&
-                    to->to_tsecr - tp->rfbuf_ts < hz) {
-                        if (tp->rfbuf_cnt >
-                            (so->so_rcv.sb_hiwat / 8 * 7) &&
-                            so->so_rcv.sb_hiwat <
-                            V_tcp_autorcvbuf_max) {
-                                newsize =
-                                    min(so->so_rcv.sb_hiwat +
-                                    V_tcp_autorcvbuf_inc,
-                                    V_tcp_autorcvbuf_max);
-                        }
-                        /* Start over with next RTT. */
-                        tp->rfbuf_ts = 0;
-                        tp->rfbuf_cnt = 0;
-                } else
-                        tp->rfbuf_cnt += tlen;  /* add up */
-        }
+        newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
 
         /* Add data to socket buffer. */
         SOCKBUF_LOCK(&so->so_rcv);
@@ -526,10 +472,6 @@ tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so,
             win = 0;
         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 
-        /* Reset receive buffer auto scaling when not in bulk receive mode. */
-        tp->rfbuf_ts = 0;
-        tp->rfbuf_cnt = 0;
-
         switch (tp->t_state) {
 
         /*
@@ -779,6 +779,8 @@ void hhook_run_tcp_est_in(struct tcpcb *tp,
         struct tcphdr *th, struct tcpopt *to);
 
 int  tcp_input(struct mbuf **, int *, int);
+int  tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *,
+        struct tcpcb *, int);
 void tcp_do_segment(struct mbuf *, struct tcphdr *,
         struct socket *, struct tcpcb *, int, int, uint8_t,
         int);