#
# Copyright (C) 2000-2004 Los Alamos National Laboratory
# P.O. Box 1663, Los Alamos NM 87545 USA
#
# This file is the DRS patch to the Linux 2.4 kernel developed by the
# RADIANT team of Los Alamos National
# Laboratory.
#
# DRS is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# DRS is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with DRS; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# While this software is free according to the GNU General Public
# License, we ask that you send any bug fixes or feature enhancements
# back to radiant-software@lanl.gov for incorporation into the official
# release.
#
--- linux-2.4.23-orig/include/net/sock.h 2003-11-28 11:26:21.000000000 -0700
+++ linux-2.4.23-drs/include/net/sock.h 2004-01-28 15:23:53.000000000 -0700
@@ -432,6 +432,17 @@
__u32 frto_highmark; /* snd_nxt when RTO occurred */
unsigned long last_synq_overflow;
+
+ /*
+ * Receiver window right-sizing
+ */
+ __u32 art_cts; /* highest clear-to-send seqno (rcv_nxt + window) advertised */
+ __u32 art_seq; /* seqno whose arrival completes the rtt measurement */
+ __u32 art_time; /* jiffies when the measurement started, 0 if none */
+ __u32 art_low; /* lowest receive-side rtt measured so far, in jiffies */
+ __u32 bpw_seq; /* rcv_nxt at the start of the current BPW sample */
+ __u32 bpw_time; /* jiffies at the start of the current BPW sample */
+
};
--- linux-2.4.23-orig/include/net/tcp.h 2003-11-28 11:26:21.000000000 -0700
+++ linux-2.4.23-drs/include/net/tcp.h 2004-01-28 15:25:16.000000000 -0700
@@ -1505,6 +1505,12 @@
unsigned int space = (__space < 0 ? 0 : __space);
/* If no clamp set the clamp to the max possible scaled window */
+ if (*window_clamp != 0) {
+#ifdef DRS_VERBOSE
+ printk(KERN_WARNING "Ignoring initial clamp of %d\n", *window_clamp);
+#endif
+ *window_clamp = 0;
+ }
if (*window_clamp == 0)
(*window_clamp) = (65535 << 14);
space = min(*window_clamp, space);
@@ -1522,9 +1528,18 @@
(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
(*rcv_wscale) = 0;
if (wscale_ok) {
+ /* Scale to hold max buffer size even if the current
+ * space is smaller. This allows the receive window to
+ * grow dynamically
+ */
+ __u32 maxw;
+ extern __u32 sysctl_rmem_max;
+
+ maxw = max(sysctl_tcp_rmem[2], sysctl_rmem_max);
+
/* See RFC1323 for an explanation of the limit to 14 */
- while (space > 65535 && (*rcv_wscale) < 14) {
- space >>= 1;
+ while (maxw > 65535 && (*rcv_wscale) < 14) {
+ maxw >>= 1;
(*rcv_wscale)++;
}
if (*rcv_wscale && sysctl_tcp_app_win && space>=mss &&
--- linux-2.4.23-orig/net/ipv4/tcp_input.c 2003-11-28 11:26:21.000000000 -0700
+++ linux-2.4.23-drs/net/ipv4/tcp_input.c 2004-01-28 16:19:47.000000000 -0700
@@ -70,6 +70,8 @@
#include
#include
+extern __u32 sysctl_rmem_default;
+
int sysctl_tcp_timestamps = 1;
int sysctl_tcp_window_scaling = 1;
int sysctl_tcp_sack = 1;
@@ -1788,7 +1790,13 @@
/* In "safe" area, increase. */
if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++;
+#ifdef DRS_VERBOSE
+ else printk(KERN_WARNING "cwnd clamped at %d\n", tp->snd_cwnd_clamp);
+#endif
} else {
+#ifdef DRS_VERBOSE
+ printk(KERN_WARNING "cwnd at snd_ssthresh %d\n", tp->snd_ssthresh);
+#endif
/* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
*/
@@ -3049,6 +3056,141 @@
}
+static __inline__ __u32 difference(__u32 start, __u32 finish)
+{
+	__s32 delta = finish - start;	/* distance start -> finish, wraps modulo 2^32 */
+	return (__u32)(delta & 0x7fffffff);	/* top bit masked off: result fits in 31 bits */
+}
+
+/*
+ * DRS:
+ *
+ * Receiver-side round-trip time estimation
+ *
+ * For uni-directional connections (like FTP data connections), the sender
+ * round-trip time measurements may not take place.
+ * The window tuning functions below depend on the availability of good
+ * round-trip times. So, here we implement round-trip timing based
+ * on the length of time between when we first advertise a window including
+ * sequence number X and when we receive a packet with that sequence number.
+ *
+ * Unless the sender is violating standards, that cannot occur for
+ * _at least_ one round-trip time.
+ * If the sender wasn't ready to send data, the measurement may be much larger
+ * than the actual rtt. We use the measurements as an upper-bound and
+ * keep track of and use the minimum measured rtt (tp->art_low, in jiffies).
+ *
+ * FLAW: If the rtt grows, we won't notice and the window will not grow
+ * accordingly.
+ */
+static __inline__ void tcp_rcv_rtt_est(struct sock *sk, struct sk_buff *skb,
+					struct tcphdr *th, struct tcp_opt *tp)
+{
+	/*
+	 * Only act while a measurement is in progress (art_time != 0);
+	 * measurements are started by tcp_ack_rtt_est(), not here.
+	 * Note: after() case shouldn't ever happen
+	 */
+	if (! tp->art_time) {
+		return;
+	} else if (after(TCP_SKB_CB(skb)->seq, tp->art_seq)) {
+#ifdef DRS_VERBOSE
+		printk("ART orphan! (fixed)\n");
+#endif
+		tp->art_time = 0;
+	} else if (between(tp->art_seq + 1, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
+		__u32 rtt = difference(tp->art_time, jiffies);
+#ifdef DRS_VERBOSE
+		printk("ART port %hu from %u: %ums = %u - %u\n",
+			ntohs(th->dest), ntohl(sk->daddr),
+			1000 * rtt / HZ,
+			tp->lrcvtime, tp->art_time);
+		/* Every value printed here is 32 bits wide, hence %u rather than %lu. */
+		printk("ART port %hu from %u: %ums, low %ums, srtt %ums\n",
+			ntohs(th->dest), ntohl(sk->daddr),
+			1000 * rtt / HZ,
+			1000 * tp->art_low /HZ,
+			(1000 * tp->srtt / HZ) >> 3 );
+#endif
+		/* Measurements are upper bounds: keep the smallest one seen. */
+		if (!tp->art_low || (rtt < tp->art_low))
+			tp->art_low = rtt;
+
+		/* Note: rtt is not fed into tcp_rtt_estimator(); only art_low is updated. */
+
+		tp->art_time = 0; /* Make a new measurement */
+	}
+}
+
+/*
+ * DRS:
+ * Automatically tune-up the receive buffer space (and consequently TCP window)
+ * to keep pace with high delay-bandwidth connections.
+ */
+static __inline__ void tcp_auto_window(struct sock *sk, struct sk_buff *skb,
+					struct tcphdr *th, struct tcp_opt *tp)
+{
+	int rtt = tp->srtt;
+
+	/* Use the srtt, or in its absence, calculate a bulk-data receive rtt */
+	if (! rtt) {
+		tcp_rcv_rtt_est(sk, skb, th, tp);
+
+		/* Use the low-water mark for bulk data transfer receive rtt,
+		   scaled like srtt */
+		rtt = tp->art_low << 3;
+	}
+	/* No sample started yet: just record the starting point below. */
+	if (!tp->bpw_time || !tp->bpw_seq) goto reset_bpw;
+
+	/*
+	 * Monitor the number of bytes received per round-trip time
+	 * Bytes received out of order might not be counted
+	 */
+	if (after(jiffies, tp->bpw_time + (rtt>>3))) {
+		/* Calculate the number of bytes since bpw_seq. */
+		/* (not including current packet) */
+		int progress = difference(tp->bpw_seq, tp->rcv_nxt);
+
+		if (rtt && (! (sk->userlocks & SOCK_RCVBUF_LOCK))) {
+			/* Diagnostics: divide before multiplying and guard rtt_ms == 0. */
+#ifdef DRS_VERBOSE
+			{
+				int rtt_ms = (1000 * rtt / HZ) >> 3;
+				int srtt_ms = (1000 * tp->srtt / HZ) >> 3;
+				int art_ms = (1000 * tp->art_low / HZ);
+				int rate = rtt_ms ? (progress / 128) * 1000 / rtt_ms : 0;
+
+				printk("BPW port %d from %u: %d bytes/%d ms (%u,%u) = %d kbps, rbuf=%u/%u seq=%u:%u\n",
+					ntohs(th->dest), ntohl(sk->daddr),
+					progress, rtt_ms, srtt_ms, art_ms, rate,
+					atomic_read(&sk->rmem_alloc), sk->rcvbuf,
+					TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq
+					);
+			}
+#endif
+			sk->rcvbuf = max(sk->rcvbuf, 2 * progress + sysctl_rmem_default);
+			sk->rcvbuf = min(sk->rcvbuf, sysctl_rmem_max);
+			if (sk->rcvbuf > tp->window_clamp) tp->window_clamp = tcp_full_space(sk);
+
+#ifdef DRS_VERBOSE
+			printk("Raised receive buffer to %d, tp->window_clamp is %d\n", sk->rcvbuf, tp->window_clamp);
+		} else {
+			printk("BPW port %d from %u: %u bytes/%u ms = NaN kbps\n",
+				ntohs(th->dest), ntohl(sk->daddr),
+				progress, 1000 * (rtt>>3) / HZ
+				);
+#endif
+		}
+		/* Start the next sample; also the target of the goto above. */
+	reset_bpw:
+
+		tp->bpw_time = jiffies;
+		tp->bpw_seq = tp->rcv_nxt;
+	}
+}
+
+
/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
* As additional protections, we do not touch cwnd in retransmission phases,
* and if application hit its sndbuf limit recently.
@@ -3079,7 +3221,7 @@
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- if (tp->packets_out < tp->snd_cwnd &&
+ if (
!(sk->userlocks&SOCK_SNDBUF_LOCK) &&
!tcp_memory_pressure &&
atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
@@ -3091,6 +3233,9 @@
if (sndmem > sk->sndbuf)
sk->sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
tp->snd_cwnd_stamp = tcp_time_stamp;
+#ifdef DRS_VERBOSE
+ printk("demand is %d, sndmem wanted is %d, sndbuf now %d, flightsize is %d\n", demanded, sndmem, sk->sndbuf, tp->snd_nxt - tp->snd_una);
+#endif
}
sk->write_space(sk);
@@ -3339,6 +3484,8 @@
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ tcp_auto_window(sk, skb, th, tp);
+
/*
* Header prediction.
* The code loosely follows the one in the famous
--- linux-2.4.23-orig/net/ipv4/tcp_minisocks.c 2003-08-25 05:44:44.000000000 -0600
+++ linux-2.4.23-drs/net/ipv4/tcp_minisocks.c 2004-01-28 15:15:39.000000000 -0700
@@ -729,6 +729,11 @@
newtp->pushed_seq = newtp->write_seq;
newtp->copied_seq = req->rcv_isn + 1;
+ newtp->art_seq = 0;
+ newtp->art_time = 0;
+ newtp->bpw_seq = 0;
+ newtp->bpw_time = 0;
+
newtp->saw_tstamp = 0;
newtp->dsack = 0;
--- linux-2.4.23-orig/net/ipv4/tcp_output.c 2003-11-28 11:26:21.000000000 -0700
+++ linux-2.4.23-drs/net/ipv4/tcp_output.c 2004-01-28 15:20:14.000000000 -0700
@@ -55,6 +55,57 @@
tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}
+/*
+ * Begin (if appropriate) a receive-side rtt measurement by taking
+ * a newly ACKd seqno and looking for (seqno+window), which is the
+ * data currently being cleared for transmission. If the sender is
+ * network bound, that data will arrive in approximately 1 round-trip time.
+ *
+ * @sk: socket the segment belongs to (read only for DRS_VERBOSE output)
+ * @th: TCP header of the outgoing segment; th->window is read
+ * @tp: TCP state; art_cts is raised and, when no measurement is in
+ *      progress, art_time and art_seq are set to start one
+ */
+static __inline__ void tcp_ack_rtt_est(struct sock *sk, struct tcphdr *th,
+					struct tcp_opt *tp)
+{
+	__u32 cts;	/* sequence number: unsigned 32-bit like art_cts/art_seq */
+
+	if (!tp->rcv_nxt) return;
+
+	cts = tp->rcv_nxt + (ntohs(th->window) << tp->rcv_wscale);
+
+	/* See if we're advancing the Clear To Send mark */
+	if (!tp->art_cts || after(cts, tp->art_cts)) {
+		/*
+		 * If we're not performing a measurement, start one.
+		 * ack_seq is set from rcv_nxt which is the
+		 * next unreceived byte. Thus, cts is the byte
+		 * just past the last one being cleared, and the
+		 * old art_cts is the first byte being cleared now.
+		 *
+		 * NOTE(review): the code records the new cts, not the
+		 * old art_cts, as the sequence number to wait for --
+		 * confirm which one is intended.
+		 * That sequence number shouldn't arrive for
+		 * at least one round-trip time.
+		 */
+		if (! tp->art_time) {
+			/* Start a new measurement */
+			tp->art_time = jiffies;
+			tp->art_seq = cts; /* Look for this seqno coming back */
+#ifdef DRS_VERBOSE
+			printk("ART lport %hu from %u: CTS = %u (%u + %d) measurement\n",
+				ntohs(th->source), ntohl(sk->daddr),
+				tp->art_cts, tp->rcv_nxt, ntohs(th->window) << tp->rcv_wscale);
+#endif
+		}
+
+		/* Save this hi-water CTS */
+		tp->art_cts = cts;
+	}
+}
+
/* SND.NXT, if window was not shrunk.
* If window has been shrunk, what should we make? It is not clear at all.
* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
@@ -275,6 +326,8 @@
TCP_INC_STATS(TcpOutSegs);
+ tcp_ack_rtt_est(sk, th, tp);
+
err = tp->af_specific->queue_xmit(skb, 0);
if (err <= 0)
return err;
@@ -683,6 +736,9 @@
if (window <= free_space - mss || window > free_space)
window = (free_space/mss)*mss;
+#ifdef DRS_VERBOSE
+ printk(KERN_WARNING "Selecting window of %d (%d free, rcv_ssthresh=%d)\n", window, free_space,tp->rcv_ssthresh);
+#endif
return window;
}