#
# Copyright (C) 2000, 2001 Los Alamos National Laboratory
# P.O. Box 1663, Los Alamos NM 87545 USA
#
# This file is the DRS patch to the Linux 2.4 kernel developed by the
# RADIANT team of Los Alamos National
# Laboratory.
#
# DRS is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# DRS is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with DRS; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# While this software is free according to the GNU General Public
# License, we ask that you send any bug fixes or feature enhancements
# back to radiant-software@lanl.gov for incorporation into the official
# release.
#
diff -r -U3 kernel-source-2.4.8/include/net/sock.h kernel-source-2.4.8.drs.clean/include/net/sock.h
--- kernel-source-2.4.8/include/net/sock.h Fri Apr 27 16:49:39 2001
+++ kernel-source-2.4.8.drs.clean/include/net/sock.h Thu Nov 8 11:16:19 2001
@@ -416,6 +416,18 @@
unsigned int keepalive_time; /* time before keep alive takes place */
unsigned int keepalive_intvl; /* time interval between keep alive probes */
int linger2;
+
+
+/*
+ * Receiver window right-sizing
+ */
+ __u32 art_cts; /* highest clear-to-send seqno advertised (rcv_nxt + window); 0 = none yet */
+ __u32 art_seq; /* seqno whose arrival completes the rtt measurement in progress */
+ __u32 art_time; /* jiffies when the measurement for art_seq was started; 0 = none in progress */
+ __u32 art_low; /* lowest receive-side rtt measured so far, in jiffies; 0 = none yet */
+ __u32 bpw_seq; /* rcv_nxt when the current byte-count sample was started */
+ __u32 bpw_time; /* jiffies when the current byte-count sample was started */
+
};
diff -r -U3 kernel-source-2.4.8/include/net/tcp.h kernel-source-2.4.8.drs.clean/include/net/tcp.h
--- kernel-source-2.4.8/include/net/tcp.h Fri Apr 27 16:50:22 2001
+++ kernel-source-2.4.8.drs.clean/include/net/tcp.h Thu Jan 10 08:49:20 2002
@@ -1485,6 +1485,12 @@
__u8 *rcv_wscale)
{
/* If no clamp set the clamp to the max possible scaled window */
+ if (*window_clamp != 0) {
+#ifdef DRS_VERBOSE
+ printk(KERN_WARNING "Ignoring initial clamp of %d\n", *window_clamp);
+#endif
+ *window_clamp = 0;
+ }
if (*window_clamp == 0)
(*window_clamp) = (65535<<14);
space = min(*window_clamp,space);
@@ -1502,9 +1508,18 @@
(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
(*rcv_wscale) = 0;
if (wscale_ok) {
+ /* Scale to hold max buffer size even if the current
+ * space is smaller. This allows the receive window to
+ * grow dynamically
+ */
+ __u32 maxw;
+ extern __u32 sysctl_rmem_max;
+
+ maxw = max(sysctl_tcp_rmem[2], sysctl_rmem_max);
+
/* See RFC1323 for an explanation of the limit to 14 */
- while (space > 65535 && (*rcv_wscale) < 14) {
- space >>= 1;
+ while (maxw > 65535 && (*rcv_wscale) < 14) {
+ maxw >>= 1;
(*rcv_wscale)++;
}
if (*rcv_wscale && sysctl_tcp_app_win && space>=mss &&
diff -r -U3 kernel-source-2.4.8/net/ipv4/tcp_input.c kernel-source-2.4.8.drs.clean/net/ipv4/tcp_input.c
--- kernel-source-2.4.8/net/ipv4/tcp_input.c Mon Jun 11 20:15:27 2001
+++ kernel-source-2.4.8.drs.clean/net/ipv4/tcp_input.c Mon Jan 14 11:56:10 2002
@@ -69,6 +69,7 @@
#include
#include
+extern __u32 sysctl_rmem_default;
/* These are on by default so the code paths get tested.
* For the final 2.2 this may be undone at our discretion. -DaveM
@@ -1705,7 +1706,13 @@
/* In "safe" area, increase. */
if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++;
+#ifdef DRS_VERBOSE
+ else printk(KERN_WARNING "cwnd clamped at %d\n", tp->snd_cwnd_clamp);
+#endif
} else {
+#ifdef DRS_VERBOSE
+ printk(KERN_WARNING "cwnd at snd_ssthresh %d\n", tp->snd_ssthresh);
+#endif
/* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
*/
@@ -2926,6 +2933,141 @@
}
+static __inline__ __u32 difference(__u32 start, __u32 finish)
+{
+	/* Distance from start to finish modulo 2^32, with bit 31 cleared. */
+	return (finish - start) & 0x7fffffffU;
+}
+
+/*
+ * DRS: receiver-side round-trip time estimation.
+ *
+ * For uni-directional connections (like FTP data connections), the sender
+ * round-trip time measurements may not take place, yet the window tuning
+ * in tcp_auto_window() depends on the availability of good round-trip times.
+ * So we time the interval between first advertising a window that includes
+ * sequence number X (tp->art_time, tp->art_seq) and receiving a segment
+ * that carries that sequence number.
+ *
+ * Unless the sender is violating standards, that cannot occur for
+ * _at least_ one round-trip time.  If the sender wasn't ready to send data,
+ * the measurement may be much larger than the actual rtt, so each sample is
+ * treated as an upper bound and only the minimum is kept in tp->art_low.
+ *
+ * sk and th are only read by the DRS_VERBOSE logging; skb supplies the
+ * sequence range of the segment just received.
+ *
+ * FLAW: If the rtt grows, we won't notice and the window will not grow
+ * accordingly
+ */
+static __inline__ void tcp_rcv_rtt_est(struct sock *sk, struct sk_buff *skb,
+				       struct tcphdr *th, struct tcp_opt *tp)
+{
+	/*
+	 * art_time == 0 means no measurement is in progress: nothing to do.
+	 * A segment that starts beyond art_seq means the sample was missed,
+	 * so abandon it.  Note: the after() case shouldn't ever happen
+	 */
+	if (! tp->art_time) {
+		return;
+	} else if (after(TCP_SKB_CB(skb)->seq, tp->art_seq)) {
+#ifdef DRS_VERBOSE
+		printk("ART orphan! (fixed)\n");
+#endif
+		tp->art_time = 0;
+	} else if (between(tp->art_seq + 1, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
+		__u32 rtt = difference(tp->art_time, jiffies);
+#ifdef DRS_VERBOSE
+		/* Casts make every argument match its %lu conversion on any word size */
+		printk("ART port %hu from %lu: %lums = %lu - %lu\n",
+		       ntohs(th->dest), (unsigned long)ntohl(sk->daddr),
+		       (unsigned long)(1000 * rtt / HZ),
+		       (unsigned long)tp->lrcvtime, (unsigned long)tp->art_time);
+		printk("ART port %hu from %lu: %lums, low %lums, srtt %lums\n",
+		       ntohs(th->dest), (unsigned long)ntohl(sk->daddr),
+		       (unsigned long)(1000 * rtt / HZ),
+		       (unsigned long)(1000 * tp->art_low / HZ),
+		       (unsigned long)((1000 * tp->srtt / HZ) >> 3));
+#endif
+
+		if (!tp->art_low || (rtt < tp->art_low))
+			tp->art_low = rtt;
+
+		/* The sample is not fed into tcp_rtt_estimator(); only art_low is updated */
+
+		tp->art_time = 0; /* Done; lets a new measurement be started */
+	}
+}
+
+/*
+ * DRS: tune-up the receive buffer space (and consequently the TCP window)
+ * to keep pace with high delay-bandwidth connections: once per rtt, grow
+ * sk->rcvbuf to 2 * bytes received + sysctl_rmem_default (<= sysctl_rmem_max).
+ */
+static __inline__ void tcp_auto_window(struct sock *sk, struct sk_buff *skb,
+				       struct tcphdr *th, struct tcp_opt *tp)
+{
+	int rtt = tp->srtt;
+
+	/* Use the srtt, or in its absence, calculate a bulk-data receive rtt */
+	if (! rtt) {
+		tcp_rcv_rtt_est(sk, skb, th, tp);
+
+		/* Use the low-water mark for bulk data transfer receive rtt,
+		   scaled like srtt (hence the >>3 wherever rtt is used below) */
+		rtt = tp->art_low << 3;
+	}
+
+	if (!tp->bpw_time || !tp->bpw_seq) goto reset_bpw;
+
+	/*
+	 * Monitor the number of bytes received per round-trip time
+	 * Bytes received out of order might not be counted
+	 */
+	if (after(jiffies, tp->bpw_time + (rtt>>3))) {
+		/* Calculate the number of bytes since bpw_seq. */
+		/* (not including current packet) */
+		int progress = difference(tp->bpw_seq, tp->rcv_nxt);
+
+		if (rtt && (! (sk->userlocks & SOCK_RCVBUF_LOCK))) {
+
+#ifdef DRS_VERBOSE
+			{
+				int rtt_ms = (1000 * rtt / HZ) >> 3;
+				int srtt_ms = (1000 * tp->srtt / HZ) >> 3;
+				int art_ms = (1000 * tp->art_low / HZ);
+				int rate = rtt_ms ? progress * 8 * 1000 / rtt_ms / 1024 : 0; /* rtt_ms can truncate to 0 */
+
+				printk("BPW port %d from %u: %d bytes/%d ms (%u,%u) = %d kbps, rbuf=%u/%u seq=%u:%u\n",
+				       ntohs(th->dest), ntohl(sk->daddr),
+				       progress, rtt_ms, srtt_ms, art_ms, rate,
+				       atomic_read(&sk->rmem_alloc), sk->rcvbuf,
+				       TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq
+				       );
+			}
+#endif
+			/* NOTE(review): min() also shrinks an rcvbuf already above sysctl_rmem_max */
+			sk->rcvbuf = max(sk->rcvbuf, 2 * progress + sysctl_rmem_default);
+			sk->rcvbuf = min(sk->rcvbuf, sysctl_rmem_max);
+			if (sk->rcvbuf > tp->window_clamp) tp->window_clamp = tcp_full_space(sk);
+#ifdef DRS_VERBOSE
+			printk("Raised receive buffer to %d, tp->window_clamp is %d\n", sk->rcvbuf, tp->window_clamp);
+		} else {
+			printk("BPW port %d from %u: %u bytes/%u ms = NaN kbps\n",
+			       ntohs(th->dest), ntohl(sk->daddr),
+			       progress, 1000 * (rtt>>3) / HZ
+			       );
+#endif
+		}
+		/* Also reached by the goto above while bpw_time or bpw_seq is still zero */
+ reset_bpw:
+
+		tp->bpw_time = jiffies;
+		tp->bpw_seq = tp->rcv_nxt;
+	}
+}
+
+
/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
* As additional protections, we do not touch cwnd in retransmission phases,
* and if application hit its sndbuf limit recently.
@@ -2956,7 +3098,7 @@
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- if (tp->packets_out < tp->snd_cwnd &&
+ if (
!(sk->userlocks&SOCK_SNDBUF_LOCK) &&
!tcp_memory_pressure &&
atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
@@ -2968,6 +3110,9 @@
if (sndmem > sk->sndbuf)
sk->sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
tp->snd_cwnd_stamp = tcp_time_stamp;
+#ifdef DRS_VERBOSE
+ printk("demand is %d, sndmem wanted is %d, sndbuf now %d, flightsize is %d\n", demanded, sndmem, sk->sndbuf, tp->snd_nxt - tp->snd_una);
+#endif
}
sk->write_space(sk);
@@ -3224,6 +3369,8 @@
struct tcphdr *th, unsigned len)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ tcp_auto_window(sk, skb, th, tp);
/*
* Header prediction.
diff -r -U3 kernel-source-2.4.8/net/ipv4/tcp_minisocks.c kernel-source-2.4.8.drs.clean/net/ipv4/tcp_minisocks.c
--- kernel-source-2.4.8/net/ipv4/tcp_minisocks.c Tue Aug 7 09:30:50 2001
+++ kernel-source-2.4.8.drs.clean/net/ipv4/tcp_minisocks.c Tue Oct 16 10:36:42 2001
@@ -725,6 +725,11 @@
newtp->pushed_seq = newtp->write_seq;
newtp->copied_seq = req->rcv_isn + 1;
+	/* DRS: start the new socket with no rtt or byte-count state (0 = unset) */
+	newtp->art_cts = newtp->art_seq = 0;
+	newtp->art_time = newtp->art_low = 0;
+	newtp->bpw_seq = newtp->bpw_time = 0;
+
newtp->saw_tstamp = 0;
newtp->dsack = 0;
diff -r -U3 kernel-source-2.4.8/net/ipv4/tcp_output.c kernel-source-2.4.8.drs.clean/net/ipv4/tcp_output.c
--- kernel-source-2.4.8/net/ipv4/tcp_output.c Tue Jul 10 17:11:43 2001
+++ kernel-source-2.4.8.drs.clean/net/ipv4/tcp_output.c Tue Oct 16 17:40:35 2001
@@ -54,6 +54,57 @@
tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}
+/*
+ * Begin (if appropriate) a receive-side rtt measurement by taking
+ * a newly ACKd seqno and looking for (seqno+window), which is the
+ * data currently being cleared for transmission.  If the sender is
+ * network bound, that data will arrive in approximately 1 round-trip time.
+ */
+static __inline__ void tcp_ack_rtt_est(struct sock *sk, struct tcphdr *th,
+				       struct tcp_opt *tp)
+{
+	__u32 cts;
+
+	if (!tp->rcv_nxt) return;
+
+	cts = tp->rcv_nxt + (ntohs(th->window) << tp->rcv_wscale);
+
+	/* See if we're advancing the Clear To Send mark */
+	if (!tp->art_cts || after(cts, tp->art_cts)) {
+		/*
+		 * If we're not performing a measurement,
+		 * start one using the first new sequence number
+		 * being cleared to send by this packet.
+		 *
+		 * ack_seq is set from rcv_nxt which is the
+		 * next unreceived byte.  Thus, cts is the byte
+		 * just past the last one being cleared, and the old
+		 * art_cts is the first byte being cleared now.
+		 * NOTE(review): art_seq is armed with the new cts, not
+		 * the old art_cts described above -- confirm intent.
+		 * That seqno shouldn't arrive for at least one round-trip time.
+		 */
+		if (! tp->art_time) {
+			/* Start a new measurement */
+			tp->art_time = jiffies;
+			tp->art_seq = cts; /* Look for this seqno coming back */
+#ifdef DRS_VERBOSE
+			printk("ART lport %hu from %lu: CTS = %lu (%lu + %lu) measurement\n",
+			       ntohs(th->source), (unsigned long)ntohl(sk->daddr), (unsigned long)tp->art_cts,
+			       (unsigned long)tp->rcv_nxt, (unsigned long)ntohs(th->window) << tp->rcv_wscale);
+#endif
+		}
+
+		/* Save this hi-water CTS */
+		tp->art_cts = cts;
+/*
+	printk("ART lport %d from %u: CTS = %u (%u + %u)\n",
+	       ntohs(th->source), ntohl(sk->daddr),
+	       cts, tp->rcv_nxt, ntohs(th->window) << tp->rcv_wscale);
+*/
+	}
+}
+
/* SND.NXT, if window was not shrunk.
* If window has been shrunk, what should we make? It is not clear at all.
* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
@@ -274,6 +325,8 @@
TCP_INC_STATS(TcpOutSegs);
+ tcp_ack_rtt_est(sk, th, tp);
+
err = tp->af_specific->queue_xmit(skb);
if (err <= 0)
return err;
@@ -682,6 +735,9 @@
if (window <= free_space - mss || window > free_space)
window = (free_space/mss)*mss;
+#ifdef DRS_VERBOSE
+ printk(KERN_WARNING "Selecting window of %d (%d free, rcv_ssthresh=%d)\n", window, free_space,tp->rcv_ssthresh);
+#endif
return window;
}