From: John Heffner Date: Tue, 21 Mar 2006 01:53:41 +0000 (-0800) Subject: [TCP]: MTU probing X-Git-Url: http://git.cdn.openwrt.org/?a=commitdiff_plain;h=5d424d5a674f782d0659a3b66d951f412901faee;p=openwrt%2Fstaging%2Fblogic.git [TCP]: MTU probing Implementation of packetization layer path mtu discovery for TCP, based on the internet-draft currently found at . Signed-off-by: John Heffner Signed-off-by: David S. Miller --- diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 8ad4beab2888..6e8880ea49e7 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -397,6 +397,8 @@ enum NET_TCP_CONG_CONTROL=110, NET_TCP_ABC=111, NET_IPV4_IPFRAG_MAX_DIST=112, + NET_TCP_MTU_PROBING=113, + NET_TCP_BASE_MSS=114, }; enum { diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index fa587c94e9d0..b3abe33f4e5f 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -72,6 +72,7 @@ struct inet_connection_sock_af_ops { * @icsk_probes_out: unanswered 0 window probes * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options) * @icsk_ack: Delayed ACK control data + * @icsk_mtup; MTU probing control data */ struct inet_connection_sock { /* inet_sock has to be the first member! */ @@ -104,6 +105,18 @@ struct inet_connection_sock { __u16 last_seg_size; /* Size of last incoming segment */ __u16 rcv_mss; /* MSS used for delayed ACK decisions */ } icsk_ack; + struct { + int enabled; + + /* Range of MTUs to search */ + int search_high; + int search_low; + + /* Information on the current probe. */ + int probe_size; + __u32 probe_seq_start; + __u32 probe_seq_end; + } icsk_mtup; u32 icsk_ca_priv[16]; #define ICSK_CA_PRIV_SIZE (16 * sizeof(u32)) }; diff --git a/include/net/tcp.h b/include/net/tcp.h index 77f21c65bbca..16879fa560de 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -60,6 +60,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); /* Minimal RCV_MSS. */ #define TCP_MIN_RCVMSS 536U +/* The least MTU to use for probing */ +#define TCP_BASE_MSS 512 + /* After receiving this amount of duplicate ACKs fast retransmit starts. */ #define TCP_FASTRETRANS_THRESH 3 @@ -219,6 +222,8 @@ extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_abc; +extern int sysctl_tcp_mtu_probing; +extern int sysctl_tcp_base_mss; extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; @@ -447,6 +452,10 @@ extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, extern void tcp_initialize_rcv_mss(struct sock *sk); +extern int tcp_mtu_to_mss(struct sock *sk, int pmtu); +extern int tcp_mss_to_mtu(struct sock *sk, int mss); +extern void tcp_mtup_init(struct sock *sk); + static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { tp->pred_flags = htonl((tp->tcp_header_len << 26) | diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 16984d4a8a06..ebf2e0b363c4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -664,6 +664,22 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = NET_TCP_MTU_PROBING, + .procname = "tcp_mtu_probing", + .data = &sysctl_tcp_mtu_probing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_BASE_MSS, + .procname = "tcp_base_mss", + .data = &sysctl_tcp_base_mss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e9a54ae7d690..0ac388e3d01d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1891,6 +1891,34 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) } } +static void tcp_mtup_probe_failed(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; + icsk->icsk_mtup.probe_size = 0; +} + +static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + /* FIXME: breaks with very large cwnd */ + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_cwnd = tp->snd_cwnd * + tcp_mss_to_mtu(sk, tp->mss_cache) / + icsk->icsk_mtup.probe_size; + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + tp->rcv_ssthresh = tcp_current_ssthresh(sk); + + icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; + icsk->icsk_mtup.probe_size = 0; + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); +} + + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2023,6 +2051,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, return; } + /* MTU probe failure: don't reduce cwnd */ + if (icsk->icsk_ca_state < TCP_CA_CWR && + icsk->icsk_mtup.probe_size && + tp->snd_una == icsk->icsk_mtup.probe_seq_start) { + tcp_mtup_probe_failed(sk); + /* Restores the reduction we did in tcp_mtup_probe() */ + tp->snd_cwnd++; + tcp_simple_retransmit(sk); + return; + } + /* Otherwise enter Recovery state */ if (IsReno(tp)) @@ -2243,6 +2282,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tp->retrans_stamp = 0; } + /* MTU probing checks */ + if (icsk->icsk_mtup.probe_size) { + if (!after(icsk->icsk_mtup.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) { + tcp_mtup_probe_success(sk, skb); + } + } + if (sacked) { if (sacked & TCPCB_RETRANS) { if(sacked & TCPCB_SACKED_RETRANS) @@ -4101,6 +4147,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.sack_ok && sysctl_tcp_fack) tp->rx_opt.sack_ok |= 2; + tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -4211,6 +4258,7 @@ discard: if (tp->ecn_flags&TCP_ECN_OK) sock_set_flag(sk, SOCK_NO_LARGESEND); + tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -4399,6 +4447,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ tp->lsndtime = tcp_time_stamp; + tcp_mtup_init(sk); tcp_initialize_rcv_mss(sk); tcp_init_buffer_space(sk); tcp_fast_path_on(tp); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 233bdf259965..57e7a26e8213 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -900,6 +900,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; newinet->id = newtp->write_seq ^ jiffies; + tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(newsk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9f498a6c8895..8197b5e12f1f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -51,6 +51,12 @@ int sysctl_tcp_retrans_collapse = 1; */ int sysctl_tcp_tso_win_divisor = 3; +int sysctl_tcp_mtu_probing = 0; +int sysctl_tcp_base_mss = 512; + +EXPORT_SYMBOL(sysctl_tcp_mtu_probing); +EXPORT_SYMBOL(sysctl_tcp_base_mss); + static void update_send_head(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) { @@ -681,6 +687,62 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) return 0; } +/* Not accounting for SACKs here. */ +int tcp_mtu_to_mss(struct sock *sk, int pmtu) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int mss_now; + + /* Calculate base mss without TCP options: + It is MMS_S - sizeof(tcphdr) of rfc1122 + */ + mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); + + /* Clamp it (mss_clamp does not include tcp options) */ + if (mss_now > tp->rx_opt.mss_clamp) + mss_now = tp->rx_opt.mss_clamp; + + /* Now subtract optional transport overhead */ + mss_now -= icsk->icsk_ext_hdr_len; + + /* Then reserve room for full set of TCP options and 8 bytes of data */ + if (mss_now < 48) + mss_now = 48; + + /* Now subtract TCP options size, not including SACKs */ + mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); + + return mss_now; +} + +/* Inverse of above */ +int tcp_mss_to_mtu(struct sock *sk, int mss) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int mtu; + + mtu = mss + + tp->tcp_header_len + + icsk->icsk_ext_hdr_len + + icsk->icsk_af_ops->net_header_len; + + return mtu; +} + +void tcp_mtup_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1; + icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + + icsk->icsk_af_ops->net_header_len; + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); + icsk->icsk_mtup.probe_size = 0; +} + /* This function synchronize snd mss to current pmtu/exthdr set. tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts @@ -708,25 +770,12 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - /* Calculate base mss without TCP options: - It is MMS_S - sizeof(tcphdr) of rfc1122 - */ - int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - - sizeof(struct tcphdr)); + int mss_now; - /* Clamp it (mss_clamp does not include tcp options) */ - if (mss_now > tp->rx_opt.mss_clamp) - mss_now = tp->rx_opt.mss_clamp; + if (icsk->icsk_mtup.search_high > pmtu) + icsk->icsk_mtup.search_high = pmtu; - /* Now subtract optional transport overhead */ - mss_now -= icsk->icsk_ext_hdr_len; - - /* Then reserve room for full set of TCP options and 8 bytes of data */ - if (mss_now < 48) - mss_now = 48; - - /* Now subtract TCP options size, not including SACKs */ - mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); + mss_now = tcp_mtu_to_mss(sk, pmtu); /* Bound mss with half of window */ if (tp->max_window && mss_now > (tp->max_window>>1)) @@ -734,6 +783,8 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; + if (icsk->icsk_mtup.enabled) + mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); tp->mss_cache = mss_now; return mss_now; @@ -1063,6 +1114,140 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_ return 1; } +/* Create a new MTU probe if we are ready. + * Returns 0 if we should wait to probe (no cwnd available), + * 1 if a probe was sent, + * -1 otherwise */ +static int tcp_mtu_probe(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb, *nskb, *next; + int len; + int probe_size; + unsigned int pif; + int copy; + int mss_now; + + /* Not currently probing/verifying, + * not in recovery, + * have enough cwnd, and + * not SACKing (the variable headers throw things off) */ + if (!icsk->icsk_mtup.enabled || + icsk->icsk_mtup.probe_size || + inet_csk(sk)->icsk_ca_state != TCP_CA_Open || + tp->snd_cwnd < 11 || + tp->rx_opt.eff_sacks) + return -1; + + /* Very simple search strategy: just double the MSS. */ + mss_now = tcp_current_mss(sk, 0); + probe_size = 2*tp->mss_cache; + if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { + /* TODO: set timer for probe_converge_event */ + return -1; + } + + /* Have enough data in the send queue to probe? */ + len = 0; + if ((skb = sk->sk_send_head) == NULL) + return -1; + while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb)) + skb = skb->next; + if (len < probe_size) + return -1; + + /* Receive window check. */ + if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) { + if (tp->snd_wnd < probe_size) + return -1; + else + return 0; + } + + /* Do we need to wait to drain cwnd? */ + pif = tcp_packets_in_flight(tp); + if (pif + 2 > tp->snd_cwnd) { + /* With no packets in flight, don't stall. */ + if (pif == 0) + return -1; + else + return 0; + } + + /* We're allowed to probe. Build it now. */ + if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL) + return -1; + sk_charge_skb(sk, nskb); + + skb = sk->sk_send_head; + __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue); + sk->sk_send_head = nskb; + + TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; + TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; + TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(nskb)->sacked = 0; + nskb->csum = 0; + if (skb->ip_summed == CHECKSUM_HW) + nskb->ip_summed = CHECKSUM_HW; + + len = 0; + while (len < probe_size) { + next = skb->next; + + copy = min_t(int, skb->len, probe_size - len); + if (nskb->ip_summed) + skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); + else + nskb->csum = skb_copy_and_csum_bits(skb, 0, + skb_put(nskb, copy), copy, nskb->csum); + + if (skb->len <= copy) { + /* We've eaten all the data from this skb. + * Throw it away. */ + TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; + __skb_unlink(skb, &sk->sk_write_queue); + sk_stream_free_skb(sk, skb); + } else { + TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & + ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + if (!skb_shinfo(skb)->nr_frags) { + skb_pull(skb, copy); + if (skb->ip_summed != CHECKSUM_HW) + skb->csum = csum_partial(skb->data, skb->len, 0); + } else { + __pskb_trim_head(skb, copy); + tcp_set_skb_tso_segs(sk, skb, mss_now); + } + TCP_SKB_CB(skb)->seq += copy; + } + + len += copy; + skb = next; + } + tcp_init_tso_segs(sk, nskb, nskb->len); + + /* We're ready to send. If this fails, the probe will + * be resegmented into mss-sized pieces by tcp_write_xmit(). */ + TCP_SKB_CB(nskb)->when = tcp_time_stamp; + if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { + /* Decrement cwnd here because we are sending + * effectively two packets. */ + tp->snd_cwnd--; + update_send_head(sk, tp, nskb); + + icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); + icsk->icsk_mtup.probe_seq_start = TCP_SKB_CB(nskb)->seq; + icsk->icsk_mtup.probe_seq_end = TCP_SKB_CB(nskb)->end_seq; + + return 1; + } + + return -1; +} + + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -1076,6 +1261,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) struct sk_buff *skb; unsigned int tso_segs, sent_pkts; int cwnd_quota; + int result; /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and all @@ -1085,6 +1271,14 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) return 0; sent_pkts = 0; + + /* Do MTU probing. */ + if ((result = tcp_mtu_probe(sk)) == 0) { + return 0; + } else if (result > 0) { + sent_pkts = 1; + } + while ((skb = sk->sk_send_head)) { unsigned int limit; @@ -1455,9 +1649,15 @@ void tcp_simple_retransmit(struct sock *sk) int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); unsigned int cur_mss = tcp_current_mss(sk, 0); int err; + /* Inconslusive MTU probe */ + if (icsk->icsk_mtup.probe_size) { + icsk->icsk_mtup.probe_size = 0; + } + /* Do not sent more than we queued. 1/4 is reserved for possible * copying overhead: fragmentation, tunneling, mangling etc. */ @@ -1883,6 +2083,7 @@ static void tcp_connect_init(struct sock *sk) if (tp->rx_opt.user_mss) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; tp->max_window = 0; + tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); if (!tp->window_clamp) @@ -2180,3 +2381,4 @@ EXPORT_SYMBOL(tcp_make_synack); EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_sync_mss); EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); +EXPORT_SYMBOL(tcp_mtup_init); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index e1880959614a..7c1bde3cd6cb 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -119,8 +119,10 @@ static int tcp_orphan_retries(struct sock *sk, int alive) /* A write timeout has occurred. Process the after effects. */ static int tcp_write_timeout(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); int retry_until; + int mss; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) @@ -128,25 +130,19 @@ static int tcp_write_timeout(struct sock *sk) retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; } else { if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { - /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black - hole detection. :-( - - It is place to make it. It is not made. I do not want - to make it. It is disgusting. It does not work in any - case. Let me to cite the same draft, which requires for - us to implement this: - - "The one security concern raised by this memo is that ICMP black holes - are often caused by over-zealous security administrators who block - all ICMP messages. It is vitally important that those who design and - deploy security systems understand the impact of strict filtering on - upper-layer protocols. The safest web site in the world is worthless - if most TCP implementations cannot transfer data from it. It would - be far nicer to have all of the black holes fixed rather than fixing - all of the TCP implementations." - - Golden words :-). - */ + /* Black hole detection */ + if (sysctl_tcp_mtu_probing) { + if (!icsk->icsk_mtup.enabled) { + icsk->icsk_mtup.enabled = 1; + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + } else { + mss = min(sysctl_tcp_base_mss, + tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2); + mss = max(mss, 68 - tp->tcp_header_len); + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + } + } dst_negative_advice(&sk->sk_dst_cache); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ca9cf6853755..14de50380f4e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -987,6 +987,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + newnp->opt->opt_flen); + tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(newsk);