mirror of https://github.com/F-Stack/f-stack.git
The TCP stack has been changed to use the estimated RTT instead of timestamps for receive buffer auto resizing.
Corresponding upstream changeset from https://svnweb.freebsd.org/base?view=revision&revision=317368.
parent 56d87bf891
commit 04b1440d33
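In brief: the old code armed the auto-resize measurement only when the peer echoed a TCP timestamp (to->to_tsecr) within one hz, while the new tcp_autorcvbuf() gates the step-up on the smoothed RTT estimate (t_srtt), so buffer scaling also works when the timestamp option is not negotiated. A minimal userland sketch of the new gate follows; this is illustrative C, not kernel code. It assumes TCP_RTT_SHIFT of 5 as in FreeBSD's tcp_var.h and elides the TCP_TS_TO_TICKS() unit conversion that the real check performs:

#include <stdint.h>
#include <stdio.h>

#define TCP_RTT_SHIFT   5       /* t_srtt is kept << 5 in the kernel */

/* Has more than one smoothed RTT elapsed since the stamp was armed? */
static int
autorcvbuf_gate(uint32_t now_ticks, uint32_t rfbuf_ts, int t_srtt)
{
        return (t_srtt != 0 && rfbuf_ts != 0 &&
            (now_ticks - rfbuf_ts) > (uint32_t)(t_srtt >> TCP_RTT_SHIFT));
}

int
main(void)
{
        int t_srtt = 100 << TCP_RTT_SHIFT;      /* smoothed RTT of 100 ticks */

        printf("after  50 ticks: %d\n", autorcvbuf_gate(1050, 1000, t_srtt));
        printf("after 150 ticks: %d\n", autorcvbuf_gate(1150, 1000, t_srtt));
        return (0);
}

This prints 0 then 1: less than one RTT elapsed means keep accumulating rfbuf_cnt, more than one RTT means evaluate a step-up, exactly the branch structure of tcp_autorcvbuf() in the diff below.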
@@ -132,6 +132,14 @@ SDT_PROBE_DEFINE6_XLATE(tcp, , , state__change,
     "void *", "void *",
     "int", "tcplsinfo_t *");
 
+SDT_PROBE_DEFINE6_XLATE(tcp, , , receive__autoresize,
+    "void *", "void *",
+    "struct tcpcb *", "csinfo_t *",
+    "struct mbuf *", "ipinfo_t *",
+    "struct tcpcb *", "tcpsinfo_t *" ,
+    "struct tcphdr *", "tcpinfoh_t *",
+    "int", "int");
+
 SDT_PROBE_DEFINE5_XLATE(udp, , , receive,
     "void *", "pktinfo_t *",
     "struct inpcb *", "csinfo_t *",
@@ -65,6 +65,7 @@ SDT_PROBE_DECLARE(tcp, , , debug__input);
 SDT_PROBE_DECLARE(tcp, , , debug__output);
 SDT_PROBE_DECLARE(tcp, , , debug__user);
 SDT_PROBE_DECLARE(tcp, , , debug__drop);
+SDT_PROBE_DECLARE(tcp, , , receive__autoresize);
 
 SDT_PROBE_DECLARE(udp, , , receive);
 SDT_PROBE_DECLARE(udp, , , send);
@@ -1494,6 +1494,68 @@ drop:
     return (IPPROTO_DONE);
 }
 
+/*
+ * Automatic sizing of receive socket buffer.  Often the send
+ * buffer size is not optimally adjusted to the actual network
+ * conditions at hand (delay bandwidth product).  Setting the
+ * buffer size too small limits throughput on links with high
+ * bandwidth and high delay (eg. trans-continental/oceanic links).
+ *
+ * On the receive side the socket buffer memory is only rarely
+ * used to any significant extent.  This allows us to be much
+ * more aggressive in scaling the receive socket buffer.  For
+ * the case that the buffer space is actually used to a large
+ * extent and we run out of kernel memory we can simply drop
+ * the new segments; TCP on the sender will just retransmit it
+ * later.  Setting the buffer size too big may only consume too
+ * much kernel memory if the application doesn't read() from
+ * the socket or packet loss or reordering makes use of the
+ * reassembly queue.
+ *
+ * The criteria to step up the receive buffer one notch are:
+ *  1. Application has not set receive buffer size with
+ *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
+ *  2. the number of bytes received during the time it takes
+ *     one timestamp to be reflected back to us (the RTT);
+ *  3. received bytes per RTT is within seven eighth of the
+ *     current socket buffer size;
+ *  4. receive buffer size has not hit maximal automatic size;
+ *
+ * This algorithm does one step per RTT at most and only if
+ * we receive a bulk stream w/o packet losses or reorderings.
+ * Shrinking the buffer during idle times is not necessary as
+ * it doesn't consume any memory when idle.
+ *
+ * TODO: Only step up if the application is actually serving
+ * the buffer to better manage the socket buffer resources.
+ */
+int
+tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
+    struct tcpcb *tp, int tlen)
+{
+    int newsize = 0;
+
+    if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) &&
+        tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
+        TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) >
+        (tp->t_srtt >> TCP_RTT_SHIFT)) {
+        if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) &&
+            so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
+            newsize = min(so->so_rcv.sb_hiwat +
+                V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max);
+        }
+        TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize);
+
+        /* Start over with next RTT. */
+        tp->rfbuf_ts = 0;
+        tp->rfbuf_cnt = 0;
+    } else {
+        tp->rfbuf_cnt += tlen;  /* add up */
+    }
+
+    return (newsize);
+}
+
 void
 tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
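The thresholds in the function above are worth seeing in motion: the buffer steps up by V_tcp_autorcvbuf_inc once per RTT, but only while more than 7/8 of the current sb_hiwat arrives within one RTT and the V_tcp_autorcvbuf_max ceiling has not been reached. A self-contained sketch of that arithmetic (the starting size, increment, and cap are illustrative values, not any branch's sysctl defaults):

#include <stdio.h>

static unsigned
min_u(unsigned a, unsigned b)
{
        return (a < b ? a : b);
}

int
main(void)
{
        unsigned sb_hiwat = 64 * 1024;          /* current receive buffer */
        unsigned inc = 16 * 1024;               /* per-RTT increment */
        unsigned max = 256 * 1024;              /* automatic-size ceiling */
        unsigned rfbuf_cnt = 1024 * 1024;       /* bytes received this RTT */

        for (int rtt = 1; rtt <= 14; rtt++) {
                /* Same test as tcp_autorcvbuf(): 7/8 full and below the cap. */
                if (rfbuf_cnt > sb_hiwat / 8 * 7 && sb_hiwat < max)
                        sb_hiwat = min_u(sb_hiwat + inc, max);
                printf("RTT %2d: sb_hiwat = %u\n", rtt, sb_hiwat);
        }
        return (0);
}

With these numbers the buffer climbs 16 KB per RTT and pins at the 256 KB cap after twelve steps, which is the "one step per RTT at most" behavior the comment describes.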
@@ -1847,62 +1909,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 #endif
         TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
 
-        /*
-         * Automatic sizing of receive socket buffer.  Often the send
-         * buffer size is not optimally adjusted to the actual network
-         * conditions at hand (delay bandwidth product).  Setting the
-         * buffer size too small limits throughput on links with high
-         * bandwidth and high delay (eg. trans-continental/oceanic links).
-         *
-         * On the receive side the socket buffer memory is only rarely
-         * used to any significant extent.  This allows us to be much
-         * more aggressive in scaling the receive socket buffer.  For
-         * the case that the buffer space is actually used to a large
-         * extent and we run out of kernel memory we can simply drop
-         * the new segments; TCP on the sender will just retransmit it
-         * later.  Setting the buffer size too big may only consume too
-         * much kernel memory if the application doesn't read() from
-         * the socket or packet loss or reordering makes use of the
-         * reassembly queue.
-         *
-         * The criteria to step up the receive buffer one notch are:
-         *  1. Application has not set receive buffer size with
-         *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
-         *  2. the number of bytes received during the time it takes
-         *     one timestamp to be reflected back to us (the RTT);
-         *  3. received bytes per RTT is within seven eighth of the
-         *     current socket buffer size;
-         *  4. receive buffer size has not hit maximal automatic size;
-         *
-         * This algorithm does one step per RTT at most and only if
-         * we receive a bulk stream w/o packet losses or reorderings.
-         * Shrinking the buffer during idle times is not necessary as
-         * it doesn't consume any memory when idle.
-         *
-         * TODO: Only step up if the application is actually serving
-         * the buffer to better manage the socket buffer resources.
-         */
-        if (V_tcp_do_autorcvbuf &&
-            (to.to_flags & TOF_TS) &&
-            to.to_tsecr &&
-            (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
-                if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
-                    to.to_tsecr - tp->rfbuf_ts < hz) {
-                        if (tp->rfbuf_cnt >
-                            (so->so_rcv.sb_hiwat / 8 * 7) &&
-                            so->so_rcv.sb_hiwat <
-                            V_tcp_autorcvbuf_max) {
-                                newsize =
-                                    min(so->so_rcv.sb_hiwat +
-                                    V_tcp_autorcvbuf_inc,
-                                    V_tcp_autorcvbuf_max);
-                        }
-                        /* Start over with next RTT. */
-                        tp->rfbuf_ts = 0;
-                        tp->rfbuf_cnt = 0;
-                } else
-                        tp->rfbuf_cnt += tlen;  /* add up */
-        }
+        newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
 
         /* Add data to socket buffer. */
         SOCKBUF_LOCK(&so->so_rcv);
@@ -1943,10 +1950,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
             win = 0;
         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 
-        /* Reset receive buffer auto scaling when not in bulk receive mode. */
-        tp->rfbuf_ts = 0;
-        tp->rfbuf_cnt = 0;
-
         switch (tp->t_state) {
 
         /*
@@ -798,11 +798,13 @@ send:
             to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
             to.to_tsecr = tp->ts_recent;
             to.to_flags |= TOF_TS;
-            /* Set receive buffer autosizing timestamp. */
-            if (tp->rfbuf_ts == 0 &&
-                (so->so_rcv.sb_flags & SB_AUTOSIZE))
-                tp->rfbuf_ts = tcp_ts_getticks();
         }
 
+        /* Set receive buffer autosizing timestamp. */
+        if (tp->rfbuf_ts == 0 &&
+            (so->so_rcv.sb_flags & SB_AUTOSIZE))
+            tp->rfbuf_ts = tcp_ts_getticks();
+
         /* Selective ACK's. */
         if (tp->t_flags & TF_SACK_PERMIT) {
             if (flags & TH_SYN)
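Note the relocation in the hunk above: arming rfbuf_ts used to happen only inside the timestamp-option branch, so a connection that never negotiated TCP timestamps could not start an auto-resize measurement. With the RTT-based gate, the stamp is now taken whenever SB_AUTOSIZE is set, independent of the TOF_TS path.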
@@ -396,62 +396,8 @@ tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
                 (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
         TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
-        /*
-         * Automatic sizing of receive socket buffer.  Often the send
-         * buffer size is not optimally adjusted to the actual network
-         * conditions at hand (delay bandwidth product).  Setting the
-         * buffer size too small limits throughput on links with high
-         * bandwidth and high delay (eg. trans-continental/oceanic links).
-         *
-         * On the receive side the socket buffer memory is only rarely
-         * used to any significant extent.  This allows us to be much
-         * more aggressive in scaling the receive socket buffer.  For
-         * the case that the buffer space is actually used to a large
-         * extent and we run out of kernel memory we can simply drop
-         * the new segments; TCP on the sender will just retransmit it
-         * later.  Setting the buffer size too big may only consume too
-         * much kernel memory if the application doesn't read() from
-         * the socket or packet loss or reordering makes use of the
-         * reassembly queue.
-         *
-         * The criteria to step up the receive buffer one notch are:
-         *  1. Application has not set receive buffer size with
-         *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
-         *  2. the number of bytes received during the time it takes
-         *     one timestamp to be reflected back to us (the RTT);
-         *  3. received bytes per RTT is within seven eighth of the
-         *     current socket buffer size;
-         *  4. receive buffer size has not hit maximal automatic size;
-         *
-         * This algorithm does one step per RTT at most and only if
-         * we receive a bulk stream w/o packet losses or reorderings.
-         * Shrinking the buffer during idle times is not necessary as
-         * it doesn't consume any memory when idle.
-         *
-         * TODO: Only step up if the application is actually serving
-         * the buffer to better manage the socket buffer resources.
-         */
-        if (V_tcp_do_autorcvbuf &&
-            (to->to_flags & TOF_TS) &&
-            to->to_tsecr &&
-            (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
-                if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) &&
-                    to->to_tsecr - tp->rfbuf_ts < hz) {
-                        if (tp->rfbuf_cnt >
-                            (so->so_rcv.sb_hiwat / 8 * 7) &&
-                            so->so_rcv.sb_hiwat <
-                            V_tcp_autorcvbuf_max) {
-                                newsize =
-                                    min(so->so_rcv.sb_hiwat +
-                                    V_tcp_autorcvbuf_inc,
-                                    V_tcp_autorcvbuf_max);
-                        }
-                        /* Start over with next RTT. */
-                        tp->rfbuf_ts = 0;
-                        tp->rfbuf_cnt = 0;
-                } else
-                        tp->rfbuf_cnt += tlen;  /* add up */
-        }
+        newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
 
         /* Add data to socket buffer. */
         SOCKBUF_LOCK(&so->so_rcv);
@@ -526,10 +472,6 @@ tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so,
             win = 0;
         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 
-        /* Reset receive buffer auto scaling when not in bulk receive mode. */
-        tp->rfbuf_ts = 0;
-        tp->rfbuf_cnt = 0;
-
         switch (tp->t_state) {
 
         /*
@@ -779,6 +779,8 @@ void hhook_run_tcp_est_in(struct tcpcb *tp,
         struct tcphdr *th, struct tcpopt *to);
 
 int  tcp_input(struct mbuf **, int *, int);
+int  tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *,
+        struct tcpcb *, int);
 void tcp_do_segment(struct mbuf *, struct tcphdr *,
         struct socket *, struct tcpcb *, int, int, uint8_t,
         int);