# # Copyright (C) 2000, 2001 Los Alamos National Laboratory # P.O. Box 1663, Los Alamos NM 87545 USA # # This file is the DRS patch to the Linux 2.4 kernel developed by the # RADIANT team of Los Alamos National # Laboratory. # # DRS is free software; you can redistribute it and/or modify it under # the terms of the GNU General Public License as published by the Free # Software Foundation; either version 2 of the License, or (at your # option) any later version. # # DRS is distributed in the hope that it will be useful, but WITHOUT ANY # WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. # # You should have received a copy of the GNU General Public License # along with DRS; if not, write to the Free Software Foundation, # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # While this software is free according to the GNU General Public # License, we ask that you send any bug fixes or feature enhancements # back to radiant-software@lanl.gov for incorporation into the official # release. 
# diff -r -U3 kernel-source-2.4.8/include/net/sock.h kernel-source-2.4.8.drs.clean/include/net/sock.h --- kernel-source-2.4.8/include/net/sock.h Fri Apr 27 16:49:39 2001 +++ kernel-source-2.4.8.drs.clean/include/net/sock.h Thu Nov 8 11:16:19 2001 @@ -416,6 +416,18 @@ unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; + + +/* + * Receiver window right-sizing + */ + __u32 art_cts; /* hi-water clear-to-send seqno: rcv_nxt + advertised window */ + __u32 art_seq; /* seqno whose arrival completes the rtt measurement in progress */ + __u32 art_time; /* jiffies when we sent ack that cleared art_seq; 0 = not measuring */ + __u32 art_low; /* lowest receive-side rtt measured, in jiffies; 0 = none yet. NOTE(review): art_low and art_cts are not reset in the tcp_minisocks.c hunk, unlike art_seq/art_time -- confirm intended */ + __u32 bpw_seq; /* rcv_nxt at the start of the current byte-count interval */ + __u32 bpw_time; /* jiffies at the start of the current byte-count interval */ + }; diff -r -U3 kernel-source-2.4.8/include/net/tcp.h kernel-source-2.4.8.drs.clean/include/net/tcp.h --- kernel-source-2.4.8/include/net/tcp.h Fri Apr 27 16:50:22 2001 +++ kernel-source-2.4.8.drs.clean/include/net/tcp.h Thu Jan 10 08:49:20 2002 @@ -1485,6 +1485,12 @@ __u8 *rcv_wscale) { /* If no clamp set the clamp to the max possible scaled window */ + if (*window_clamp != 0) { +#ifdef DRS_VERBOSE + printk(KERN_WARNING "Ignoring initial clamp of %d\n", *window_clamp); +#endif + *window_clamp = 0; + } if (*window_clamp == 0) (*window_clamp) = (65535<<14); space = min(*window_clamp,space); @@ -1502,9 +1508,18 @@ (*rcv_wnd) = min(space, MAX_TCP_WINDOW); (*rcv_wscale) = 0; if (wscale_ok) { + /* Scale to hold max buffer size even if the current + * space is smaller. 
This allows the receive window to + * grow dynamically + */ + __u32 maxw; + extern __u32 sysctl_rmem_max; + + maxw = max(sysctl_tcp_rmem[2], sysctl_rmem_max); + /* See RFC1323 for an explanation of the limit to 14 */ - while (space > 65535 && (*rcv_wscale) < 14) { - space >>= 1; + while (maxw > 65535 && (*rcv_wscale) < 14) { + maxw >>= 1; (*rcv_wscale)++; } if (*rcv_wscale && sysctl_tcp_app_win && space>=mss && diff -r -U3 kernel-source-2.4.8/net/ipv4/tcp_input.c kernel-source-2.4.8.drs.clean/net/ipv4/tcp_input.c --- kernel-source-2.4.8/net/ipv4/tcp_input.c Mon Jun 11 20:15:27 2001 +++ kernel-source-2.4.8.drs.clean/net/ipv4/tcp_input.c Mon Jan 14 11:56:10 2002 @@ -69,6 +69,7 @@ #include #include +extern __u32 sysctl_rmem_default; /* These are on by default so the code paths get tested. * For the final 2.2 this may be undone at our discretion. -DaveM @@ -1705,7 +1706,13 @@ /* In "safe" area, increase. */ if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; +#ifdef DRS_VERBOSE + else printk(KERN_WARNING "cwnd clamped at %d\n", tp->snd_cwnd_clamp); +#endif } else { +#ifdef DRS_VERBOSE + printk(KERN_WARNING "cwnd at snd_ssthresh %d\n", tp->snd_ssthresh); +#endif /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ @@ -2926,6 +2933,141 @@ } +static __inline__ __u32 difference(__u32 start, __u32 finish) +{ + return (__u32)((__s32)(finish - start) & 0x7fffffff); + /* Bit 31 of the result is masked off; unmasked form: return (finish - start); */ +} + +/* + * DRS: + * + * Receiver-side round-trip time estimation + * + * For uni-directional connections (like FTP data connections), the sender + * round-trip time measurements may not take place. + * The window tuning functions below depend on the availability of good + * round-trip times. 
So, here we implement round-trip timing based + * on the length of time between when we first advertise a window including + * sequence number X and when we receive a packet with that sequence number. + * + * Unless the sender is violating standards, that cannot occur for + * _at least_ one round-trip time. + * If the sender wasn't ready to send data, the measurement may be much larger + * than the actual rtt. We use the measurements as an upper-bound and + * keep track of and use the minimum measured rtt. + * + * FLAW: If the rtt grows, we won't notice and the window will not grow + * accordingly + */ +static __inline__ void tcp_rcv_rtt_est(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, struct tcp_opt *tp) +{ + /* + * If no measurement is in progress, there is nothing to do. + * If this segment is already past art_seq, the measurement is stale: drop it. + * Note: after() case shouldn't ever happen + */ + if (! tp->art_time) { + return; + } else if (after(TCP_SKB_CB(skb)->seq, tp->art_seq)) { +#ifdef DRS_VERBOSE + printk("ART orphan! (fixed)\n"); +#endif + tp->art_time = 0; + } else if (between(tp->art_seq + 1, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + __u32 rtt = difference(tp->art_time, jiffies); +#ifdef DRS_VERBOSE + printk("ART port %hu from %lu: %lums = %lu - %lu\n", + ntohs(th->dest), ntohl(sk->daddr), + 1000 * rtt / HZ, + tp->lrcvtime, tp->art_time); + + printk("ART port %hu from %lu: %lums, low %lums, srtt %lums\n", + ntohs(th->dest), ntohl(sk->daddr), + 1000 * rtt / HZ, + 1000 * tp->art_low /HZ, + (1000 * tp->srtt / HZ) >> 3 ); +#endif + + if (!tp->art_low || (rtt < tp->art_low)) + tp->art_low = rtt; + + /* tcp_rtt_estimator(tp, rtt); */ + + tp->art_time = 0; /* Measurement done; allow a new one to start */ + } +} + +/* + * DRS: + * Automatically tune-up the receive buffer space (and consequently TCP window) + * to keep pace with high delay-bandwidth connections. 
+ */ +static __inline__ void tcp_auto_window(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, struct tcp_opt *tp) +{ + int rtt = tp->srtt; + + /* Use the srtt, or in its absence, calculate a bulk-data receive rtt */ + if (! rtt) { + tcp_rcv_rtt_est(sk, skb, th, tp); + + /* Use the low-water mark for bulk data transfer receive rtt, + scaled like srtt (shifted left by 3) */ + rtt = tp->art_low << 3; + } + + if (!tp->bpw_time || !tp->bpw_seq) goto reset_bpw; + + /* + * Monitor the number of bytes received per round-trip time. + * Bytes received out of order might not be counted. + */ + if (after(jiffies, tp->bpw_time + (rtt>>3))) { + /* Calculate the number of bytes since bpw_seq. */ + /* (not including current packet) */ + int progress = difference(tp->bpw_seq, tp->rcv_nxt); + + if (rtt && (! (sk->userlocks & SOCK_RCVBUF_LOCK))) { + +#ifdef DRS_VERBOSE + { + int rtt_ms = (1000 * rtt / HZ) >> 3; + int srtt_ms = (1000 * tp->srtt / HZ) >> 3; + int art_ms = (1000 * tp->art_low / HZ); + int rate = progress * 8 * 1000 / rtt_ms / 1024; + + printk("BPW port %d from %u: %d bytes/%d ms (%u,%u) = %d kbps, rbuf=%u/%u seq=%u:%u\n", + ntohs(th->dest), ntohl(sk->daddr), + progress, rtt_ms, srtt_ms, art_ms, rate, + atomic_read(&sk->rmem_alloc), sk->rcvbuf, + TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq + ); + } +#endif + sk->rcvbuf = max(sk->rcvbuf, 2 * progress + sysctl_rmem_default); + sk->rcvbuf = min(sk->rcvbuf, sysctl_rmem_max); + if (sk->rcvbuf > tp->window_clamp) tp->window_clamp = tcp_full_space(sk); + +#ifdef DRS_VERBOSE + printk("Raised receive buffer to %d, tp->window_clamp is %d\n", sk->rcvbuf, tp->window_clamp); + } else { + printk("BPW port %d from %u: %u bytes/%u ms = NaN kbps\n", + th->dest, sk->daddr, + progress, 1000 * (rtt>>3) / HZ + ); +#endif + } + + reset_bpw: + + tp->bpw_time = jiffies; + tp->bpw_seq = tp->rcv_nxt; + } +} + + /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. 
* As additional protections, we do not touch cwnd in retransmission phases, * and if application hit its sndbuf limit recently. @@ -2956,7 +3098,7 @@ { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (tp->packets_out < tp->snd_cwnd && + if ( !(sk->userlocks&SOCK_SNDBUF_LOCK) && !tcp_memory_pressure && atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { @@ -2968,6 +3110,9 @@ if (sndmem > sk->sndbuf) sk->sndbuf = min(sndmem, sysctl_tcp_wmem[2]); tp->snd_cwnd_stamp = tcp_time_stamp; +#ifdef DRS_VERBOSE + printk("demand is %d, sndmem wanted is %d, sndbuf now %d, flightsize is %d\n", demanded, sndmem, sk->sndbuf, tp->snd_nxt - tp->snd_una); +#endif } sk->write_space(sk); @@ -3224,6 +3369,8 @@ struct tcphdr *th, unsigned len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + tcp_auto_window(sk, skb, th, tp); /* * Header prediction. diff -r -U3 kernel-source-2.4.8/net/ipv4/tcp_minisocks.c kernel-source-2.4.8.drs.clean/net/ipv4/tcp_minisocks.c --- kernel-source-2.4.8/net/ipv4/tcp_minisocks.c Tue Aug 7 09:30:50 2001 +++ kernel-source-2.4.8.drs.clean/net/ipv4/tcp_minisocks.c Tue Oct 16 10:36:42 2001 @@ -725,6 +725,11 @@ newtp->pushed_seq = newtp->write_seq; newtp->copied_seq = req->rcv_isn + 1; + newtp->art_seq = 0; + newtp->art_time = 0; + newtp->bpw_seq = 0; + newtp->bpw_time = 0; + newtp->saw_tstamp = 0; newtp->dsack = 0; diff -r -U3 kernel-source-2.4.8/net/ipv4/tcp_output.c kernel-source-2.4.8.drs.clean/net/ipv4/tcp_output.c --- kernel-source-2.4.8/net/ipv4/tcp_output.c Tue Jul 10 17:11:43 2001 +++ kernel-source-2.4.8.drs.clean/net/ipv4/tcp_output.c Tue Oct 16 17:40:35 2001 @@ -54,6 +54,57 @@ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } +/* + * Begin (if appropriate) a receive-side rtt measurement by taking + * a newly ACKd seqno and looking for (seqno+window), which is the + * data currently being cleared for transmission. If the sender is + * network bound, that data will arrive in approximately 1 round-trip time. 
+ */ +static __inline__ void tcp_ack_rtt_est(struct sock *sk, struct tcphdr *th, + struct tcp_opt *tp) +{ + int cts; /* NOTE(review): holds a sequence number but is a signed int, while art_cts/art_seq are __u32 -- confirm */ + + if (!tp->rcv_nxt) return; + + cts = tp->rcv_nxt + (ntohs(th->window) << tp->rcv_wscale); + + /* See if we're advancing the Clear To Send mark */ + if (!tp->art_cts || after(cts, tp->art_cts)) { + /* + * If we're not performing a measurement, + * start one using the first new sequence number + * being cleared to send by this packet. + * + * ack_seq is set from rcv_nxt which is the + * next unreceived byte. Thus, cts is the byte + * just past the last one being cleared. So, + * the old value is the first byte being cleared now. + * + * That sequence number shouldn't arrive for + * at least one round-trip time. + */ + if (! tp->art_time) { + /* Start a new measurement */ + tp->art_time = jiffies; + tp->art_seq = cts; /* Look for this seqno coming back */ +#ifdef DRS_VERBOSE + printk("ART lport %hu from %lu: CTS = %lu (%lu + %hu) measurement\n", + ntohs(th->source), ntohl(sk->daddr), + tp->art_cts, tp->rcv_nxt, ntohs(th->window) << tp->rcv_wscale); +#endif + } + + /* Save this hi-water CTS */ + tp->art_cts = cts; +/* + printk("ART lport %d from %u: CTS = %u (%u + %u)\n", + ntohs(th->source), ntohl(sk->daddr), + cts, tp->rcv_nxt, ntohs(th->window) << tp->rcv_wscale); +*/ + } +} + /* SND.NXT, if window was not shrunk. * If window has been shrunk, what should we make? It is not clear at all. * Using SND.UNA we will fail to open window, SND.NXT is out of window. 
:-( @@ -274,6 +325,8 @@ TCP_INC_STATS(TcpOutSegs); + tcp_ack_rtt_est(sk, th, tp); + err = tp->af_specific->queue_xmit(skb); if (err <= 0) return err; @@ -682,6 +735,9 @@ if (window <= free_space - mss || window > free_space) window = (free_space/mss)*mss; +#ifdef DRS_VERBOSE + printk(KERN_WARNING "Selecting window of %d (%d free, rcv_ssthresh=%d)\n", window, free_space,tp->rcv_ssthresh); +#endif return window; }