Merge branch 'accecn-protocol-patch-series'

Chia-Yu Chang says:

====================
AccECN protocol patch series

Please find the v19 AccECN protocol patch series, which covers the core
functionality of Accurate ECN, AccECN negotiation, AccECN TCP options,
and AccECN failure handling. The Accurate ECN draft can be found in
https://datatracker.ietf.org/doc/html/draft-ietf-tcpm-accurate-ecn-28, and it
will be RFC9768.

This patch series is part of the full AccECN patch series, which is available at
https://github.com/L4STeam/linux-net-next/commits/upstream_l4steam/
---
Chia-Yu Chang (3):
  tcp: accecn: AccECN option send control
  tcp: accecn: AccECN option failure handling
  tcp: accecn: try to fit AccECN option with SACK

Ilpo Järvinen (7):
  tcp: AccECN core
  tcp: accecn: AccECN negotiation
  tcp: accecn: add AccECN rx byte counters
  tcp: accecn: AccECN needs to know delivered bytes
  tcp: sack option handling improvements
  tcp: accecn: AccECN option
  tcp: accecn: AccECN option ceb/cep and ACE field multi-wrap heuristics

 Documentation/networking/ip-sysctl.rst        |  55 +-
 .../networking/net_cachelines/tcp_sock.rst    |  12 +
 include/linux/tcp.h                           |  28 +-
 include/net/netns/ipv4.h                      |   2 +
 include/net/tcp.h                             |  33 ++
 include/net/tcp_ecn.h                         | 554 +++++++++++++++++-
 include/uapi/linux/tcp.h                      |   9 +
 net/ipv4/syncookies.c                         |   4 +
 net/ipv4/sysctl_net_ipv4.c                    |  19 +
 net/ipv4/tcp.c                                |  30 +-
 net/ipv4/tcp_input.c                          | 318 +++++++++-
 net/ipv4/tcp_ipv4.c                           |   8 +-
 net/ipv4/tcp_minisocks.c                      |  40 +-
 net/ipv4/tcp_output.c                         | 239 +++++++-
 net/ipv6/syncookies.c                         |   2 +
 net/ipv6/tcp_ipv6.c                           |   1 +
 16 files changed, 1278 insertions(+), 76 deletions(-)
====================

Link: https://patch.msgid.link/20250916082434.100722-1-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
This commit is contained in:
Paolo Abeni
2025-09-18 08:47:54 +02:00
16 changed files with 1279 additions and 77 deletions

View File

@@ -443,23 +443,56 @@ tcp_early_retrans - INTEGER
tcp_ecn - INTEGER
Control use of Explicit Congestion Notification (ECN) by TCP.
ECN is used only when both ends of the TCP connection indicate
support for it. This feature is useful in avoiding losses due
to congestion by allowing supporting routers to signal
congestion before having to drop packets.
ECN is used only when both ends of the TCP connection indicate support
for it. This feature is useful in avoiding losses due to congestion by
allowing supporting routers to signal congestion before having to drop
packets. A host that supports ECN both sends ECN at the IP layer and
feeds back ECN at the TCP layer. The highest variant of ECN feedback
that both peers support is chosen by the ECN negotiation (Accurate ECN,
ECN, or no ECN).
The highest negotiated variant for incoming connection requests
and the highest variant requested by outgoing connection
attempts:
===== ==================== ====================
Value Incoming connections Outgoing connections
===== ==================== ====================
0 No ECN No ECN
1 ECN ECN
2 ECN No ECN
3 AccECN AccECN
4 AccECN ECN
5 AccECN No ECN
===== ==================== ====================
Default: 2
tcp_ecn_option - INTEGER
Control Accurate ECN (AccECN) option sending when AccECN has been
successfully negotiated during handshake. Send logic inhibits
sending AccECN options regardless of this setting when no AccECN
option has been seen for the reverse direction.
Possible values are:
= =====================================================
0 Disable ECN. Neither initiate nor accept ECN.
1 Enable ECN when requested by incoming connections and
also request ECN on outgoing connection attempts.
2 Enable ECN when requested by incoming connections
but do not request ECN on outgoing connections.
= =====================================================
= ============================================================
0 Never send AccECN option. This also disables sending AccECN
option in SYN/ACK during handshake.
1 Send AccECN option sparingly according to the minimum option
rules outlined in draft-ietf-tcpm-accurate-ecn.
2 Send AccECN option on every packet whenever it fits into TCP
option space.
= ============================================================
Default: 2
tcp_ecn_option_beacon - INTEGER
Control Accurate ECN (AccECN) option sending frequency per RTT and it
takes effect only when tcp_ecn_option is set to 2.
Default: 3 (AccECN option will be sent at least 3 times per RTT)
tcp_ecn_fallback - BOOLEAN
If the kernel detects that ECN connection misbehaves, enable fall
back to non-ECN. Currently, this knob implements the fallback

View File

@@ -101,6 +101,18 @@ u32 prr_delivered
u32 prr_out read_mostly read_mostly tcp_rate_skb_sent,tcp_newly_delivered(tx);tcp_ack,tcp_rate_gen,tcp_clean_rtx_queue(rx)
u32 delivered read_mostly read_write tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx)
u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx)
u32 received_ce read_mostly read_write
u32[3] received_ecn_bytes read_mostly read_write
u8:4 received_ce_pending read_mostly read_write
u32[3] delivered_ecn_bytes read_write
u8:2 syn_ect_snt write_mostly read_write
u8:2 syn_ect_rcv read_mostly read_write
u8:2 accecn_minlen write_mostly read_write
u8:2 est_ecnfield read_write
u8:2 accecn_opt_demand read_mostly read_write
u8:2 prev_ecnfield read_write
u64 accecn_opt_tstamp read_write
u8:4 accecn_fail_mode
u32 lost read_mostly tcp_ack
u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx)
u64 first_tx_mstamp read_write tcp_rate_skb_sent

View File

@@ -122,8 +122,9 @@ struct tcp_options_received {
smc_ok : 1, /* SMC seen on SYN packet */
snd_wscale : 4, /* Window scaling received from sender */
rcv_wscale : 4; /* Window scaling to send to receiver */
u8 saw_unknown:1, /* Received unknown option */
unused:7;
u8 accecn:6, /* AccECN index in header, 0=no options */
saw_unknown:1, /* Received unknown option */
unused:1;
u8 num_sacks; /* Number of SACK blocks */
u16 user_mss; /* mss requested by user in ioctl */
u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
@@ -168,6 +169,11 @@ struct tcp_request_sock {
* after data-in-SYN.
*/
u8 syn_tos;
bool accecn_ok;
u8 syn_ect_snt: 2,
syn_ect_rcv: 2,
accecn_fail_mode:4;
u8 saw_accecn_opt :2;
#ifdef CONFIG_TCP_AO
u8 ao_keyid;
u8 ao_rcv_next;
@@ -270,6 +276,7 @@ struct tcp_sock {
u32 mdev_us; /* medium deviation */
u32 rtt_seq; /* sequence number to update rttvar */
u64 tcp_wstamp_ns; /* departure time for next sent data packet */
u64 accecn_opt_tstamp; /* Last AccECN option sent timestamp */
struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
struct sk_buff *highest_sack; /* skb just after the highest
* skb with SACKed bit set
@@ -287,6 +294,12 @@ struct tcp_sock {
*/
u8 nonagle : 4,/* Disable Nagle algorithm? */
rate_app_limited:1; /* rate_{delivered,interval_us} limited? */
u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */
unused2:4;
u8 accecn_minlen:2,/* Minimum length of AccECN option sent */
est_ecnfield:2,/* ECN field for AccECN delivered estimates */
accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */
prev_ecnfield:2; /* ECN bits from the previous segment */
__be32 pred_flags;
u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
u64 tcp_mstamp; /* most recent packet received/sent */
@@ -299,6 +312,11 @@ struct tcp_sock {
u32 snd_up; /* Urgent pointer */
u32 delivered; /* Total data packets delivered incl. rexmits */
u32 delivered_ce; /* Like the above but only ECE marked packets */
u32 received_ce; /* Like the above but for rcvd CE marked pkts */
u32 received_ecn_bytes[3]; /* received byte counters for three ECN
* types: INET_ECN_ECT_1, INET_ECN_ECT_0,
* and INET_ECN_CE
*/
u32 app_limited; /* limited until "delivered" reaches this val */
u32 rcv_wnd; /* Current receiver window */
/*
@@ -326,6 +344,7 @@ struct tcp_sock {
u32 rate_delivered; /* saved rate sample: packets delivered */
u32 rate_interval_us; /* saved rate sample: time elapsed */
u32 rcv_rtt_last_tsecr;
u32 delivered_ecn_bytes[3];
u64 first_tx_mstamp; /* start of window send phase */
u64 delivered_mstamp; /* time we reached "delivered" */
u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked
@@ -372,7 +391,8 @@ struct tcp_sock {
u8 compressed_ack;
u8 dup_ack_counter:2,
tlp_retrans:1, /* TLP is a retransmission */
unused:5;
syn_ect_snt:2, /* AccECN ECT memory, only */
syn_ect_rcv:2; /* ... needed during 3WHS + first seqno */
u8 thin_lto : 1,/* Use linear timeouts for thin streams */
fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
@@ -388,6 +408,8 @@ struct tcp_sock {
syn_fastopen_child:1; /* created TFO passive child socket */
u8 keepalive_probes; /* num of allowed keep alive probes */
u8 accecn_fail_mode:4, /* AccECN failure handling */
saw_accecn_opt:2; /* An AccECN option was seen */
u32 tcp_tx_delay; /* delay (in usec) added to TX packets */
/* RTT measurement */

View File

@@ -148,6 +148,8 @@ struct netns_ipv4 {
struct local_ports ip_local_ports;
u8 sysctl_tcp_ecn;
u8 sysctl_tcp_ecn_option;
u8 sysctl_tcp_ecn_option_beacon;
u8 sysctl_tcp_ecn_fallback;
u8 sysctl_ip_default_ttl;

View File

@@ -100,6 +100,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
/* Maximal number of window scale according to RFC1323 */
#define TCP_MAX_WSCALE 14U
/* Default sending frequency of accurate ECN option per RTT */
#define TCP_ACCECN_OPTION_BEACON 3
/* urg_data states */
#define TCP_URG_VALID 0x0100
#define TCP_URG_NOTYET 0x0200
@@ -213,6 +216,8 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCPOPT_AO 29 /* Authentication Option (RFC5925) */
#define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */
#define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */
#define TCPOPT_ACCECN0 172 /* 0xAC: Accurate ECN Order 0 */
#define TCPOPT_ACCECN1 174 /* 0xAE: Accurate ECN Order 1 */
#define TCPOPT_EXP 254 /* Experimental */
/* Magic number to be after the option value for sharing TCP
* experimental options. See draft-ietf-tcpm-experimental-options-00.txt
@@ -230,6 +235,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCPOLEN_TIMESTAMP 10
#define TCPOLEN_MD5SIG 18
#define TCPOLEN_FASTOPEN_BASE 2
#define TCPOLEN_ACCECN_BASE 2
#define TCPOLEN_EXP_FASTOPEN_BASE 4
#define TCPOLEN_EXP_SMC_BASE 6
@@ -243,6 +249,14 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCPOLEN_MD5SIG_ALIGNED 20
#define TCPOLEN_MSS_ALIGNED 4
#define TCPOLEN_EXP_SMC_BASE_ALIGNED 8
#define TCPOLEN_ACCECN_PERFIELD 3
/* Maximum number of byte counters in AccECN option + size */
#define TCP_ACCECN_NUMFIELDS 3
#define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \
TCPOLEN_ACCECN_PERFIELD * \
TCP_ACCECN_NUMFIELDS)
#define TCP_ACCECN_SAFETY_SHIFT 1 /* SAFETY_FACTOR in accecn draft */
/* Flags in tp->nonagle */
#define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */
@@ -972,6 +986,18 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
#define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)
#define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)
#define TCPHDR_SYNACK_ACCECN (TCPHDR_SYN | TCPHDR_ACK | TCPHDR_CWR)
#define TCP_ACCECN_CEP_ACE_MASK 0x7
#define TCP_ACCECN_ACE_MAX_DELTA 6
/* To avoid/detect middlebox interference, not all counters start at 0.
* See draft-ietf-tcpm-accurate-ecn for the latest values.
*/
#define TCP_ACCECN_CEP_INIT_OFFSET 5
#define TCP_ACCECN_E1B_INIT_OFFSET 1
#define TCP_ACCECN_E0B_INIT_OFFSET 1
#define TCP_ACCECN_CEB_INIT_OFFSET 0
/* State flags for sacked in struct tcp_skb_cb */
enum tcp_skb_cb_sacked_flags {
@@ -1782,11 +1808,18 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
u32 ace;
/* mptcp hooks are only on the slow path */
if (sk_is_mptcp((struct sock *)tp))
return;
ace = tcp_ecn_mode_accecn(tp) ?
((tp->delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) &
TCP_ACCECN_CEP_ACE_MASK) : 0;
tp->pred_flags = htonl((tp->tcp_header_len << 26) |
(ace << 22) |
ntohl(TCP_FLAG_ACK) |
snd_wnd);
}

View File

@@ -4,14 +4,36 @@
#include <linux/tcp.h>
#include <linux/skbuff.h>
#include <linux/bitfield.h>
#include <net/inet_connection_sock.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <net/inet_ecn.h>
/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is
 * attempted to be negotiated and requested for incoming connection
* and outgoing connection, respectively.
*/
enum tcp_ecn_mode {
TCP_ECN_IN_NOECN_OUT_NOECN = 0,
TCP_ECN_IN_ECN_OUT_ECN = 1,
TCP_ECN_IN_ECN_OUT_NOECN = 2,
TCP_ECN_IN_ACCECN_OUT_ACCECN = 3,
TCP_ECN_IN_ACCECN_OUT_ECN = 4,
TCP_ECN_IN_ACCECN_OUT_NOECN = 5,
};
/* AccECN option sending when AccECN has been successfully negotiated */
enum tcp_accecn_option {
TCP_ACCECN_OPTION_DISABLED = 0,
TCP_ACCECN_OPTION_MINIMUM = 1,
TCP_ACCECN_OPTION_FULL = 2,
};
static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
/* Do not set CWR if in AccECN mode! */
if (tcp_ecn_mode_rfc3168(tp))
tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}
@@ -19,8 +41,10 @@ static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp)
static inline void tcp_ecn_accept_cwr(struct sock *sk,
const struct sk_buff *skb)
{
if (tcp_hdr(skb)->cwr) {
tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) {
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
/* If the sender is telling us it has entered CWR, then its
* cwnd may be very low (even just 1 packet), so we should ACK
@@ -36,16 +60,485 @@ static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
}
static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp,
const struct tcphdr *th)
/* tp->accecn_fail_mode */
#define TCP_ACCECN_ACE_FAIL_SEND BIT(0)
#define TCP_ACCECN_ACE_FAIL_RECV BIT(1)
#define TCP_ACCECN_OPT_FAIL_SEND BIT(2)
#define TCP_ACCECN_OPT_FAIL_RECV BIT(3)
static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp)
{
if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr))
tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND;
}
static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp,
const struct tcphdr *th)
static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp)
{
return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV;
}
static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp)
{
return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND;
}
static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp)
{
return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV;
}
static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode)
{
tp->accecn_fail_mode |= mode;
}
#define TCP_ACCECN_OPT_NOT_SEEN 0x0
#define TCP_ACCECN_OPT_EMPTY_SEEN 0x1
#define TCP_ACCECN_OPT_COUNTER_SEEN 0x2
#define TCP_ACCECN_OPT_FAIL_SEEN 0x3
static inline u8 tcp_accecn_ace(const struct tcphdr *th)
{
return (th->ae << 2) | (th->cwr << 1) | th->ece;
}
/* Infer the ECT value our SYN arrived with from the echoed ACE field */
static inline int tcp_accecn_extract_syn_ect(u8 ace)
{
/* Below is an excerpt from the 1st block of Table 2 of AccECN spec */
static const int ace_to_ecn[8] = {
INET_ECN_ECT_0, /* 0b000 (Undefined) */
INET_ECN_ECT_1, /* 0b001 (Undefined) */
INET_ECN_NOT_ECT, /* 0b010 (Not-ECT is received) */
INET_ECN_ECT_1, /* 0b011 (ECT-1 is received) */
INET_ECN_ECT_0, /* 0b100 (ECT-0 is received) */
INET_ECN_ECT_1, /* 0b101 (Reserved) */
INET_ECN_CE, /* 0b110 (CE is received) */
INET_ECN_ECT_1 /* 0b111 (Undefined) */
};
return ace_to_ecn[ace & 0x7];
}
/* Check ECN field transition to detect invalid transitions */
static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv)
{
if (rcv == snt)
return true;
/* Non-ECT altered to something or something became non-ECT */
if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT)
return false;
/* CE -> ECT(0/1)? */
if (snt == INET_ECN_CE)
return false;
return true;
}
static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace,
u8 sent_ect)
{
u8 ect = tcp_accecn_extract_syn_ect(ace);
struct tcp_sock *tp = tcp_sk(sk);
if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
return true;
if (!tcp_ect_transition_valid(sent_ect, ect)) {
tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV);
return false;
}
return true;
}
static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp,
u8 saw_opt)
{
tp->saw_accecn_opt = saw_opt;
if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN)
tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
}
/* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */
static inline void tcp_accecn_third_ack(struct sock *sk,
const struct sk_buff *skb, u8 sent_ect)
{
u8 ace = tcp_accecn_ace(tcp_hdr(skb));
struct tcp_sock *tp = tcp_sk(sk);
switch (ace) {
case 0x0:
/* Invalid value */
tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV);
break;
case 0x7:
case 0x5:
case 0x1:
/* Unused but legal values */
break;
default:
/* Validation only applies to first non-data packet */
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
!TCP_SKB_CB(skb)->sacked &&
tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) {
if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) &&
!tp->delivered_ce)
tp->delivered_ce++;
}
break;
}
}
/* Demand the minimum # to send AccECN option */
static inline void tcp_accecn_opt_demand_min(struct sock *sk,
u8 opt_demand_min)
{
struct tcp_sock *tp = tcp_sk(sk);
u8 opt_demand;
opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand);
tp->accecn_opt_demand = opt_demand;
}
/* Maps IP ECN field ECT/CE code point to AccECN option field number, given
* we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0).
*/
static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield)
{
switch (ecnfield & INET_ECN_MASK) {
case INET_ECN_NOT_ECT:
return 0; /* AccECN does not send counts of NOT_ECT */
case INET_ECN_ECT_1:
return 1;
case INET_ECN_CE:
return 2;
case INET_ECN_ECT_0:
return 3;
}
return 0;
}
/* Maps IP ECN field ECT/CE code point to AccECN option field value offset.
* Some fields do not start from zero, to detect zeroing by middleboxes.
*/
static inline u32 tcp_accecn_field_init_offset(u8 ecnfield)
{
switch (ecnfield & INET_ECN_MASK) {
case INET_ECN_NOT_ECT:
return 0; /* AccECN does not send counts of NOT_ECT */
case INET_ECN_ECT_1:
return TCP_ACCECN_E1B_INIT_OFFSET;
case INET_ECN_CE:
return TCP_ACCECN_CEB_INIT_OFFSET;
case INET_ECN_ECT_0:
return TCP_ACCECN_E0B_INIT_OFFSET;
}
return 0;
}
/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */
static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option,
bool order)
{
/* Based on Table 5 of the AccECN spec to map (option, order) to
 * the corresponding ECN counters (ECT-1, ECT-0, or CE).
*/
static const u8 optfield_lookup[2][3] = {
/* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */
{ INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 },
/* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */
{ INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 }
};
return optfield_lookup[order][option % 3];
}
/* Handles AccECN option ECT and CE 24-bit byte counters update into
* the u32 value in tcp_sock. As we're processing TCP options, it is
* safe to access from - 1.
*/
static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from,
u32 init_offset)
{
u32 truncated = (get_unaligned_be32(from - 1) - init_offset) &
0xFFFFFFU;
u32 delta = (truncated - *cnt) & 0xFFFFFFU;
/* If delta has the highest bit set (24th bit) indicating
* negative, sign extend to correct an estimation using
* sign_extend32(delta, 24 - 1)
*/
delta = sign_extend32(delta, 23);
*cnt += delta;
return (s32)delta;
}
/* Updates Accurate ECN received counters from the received IP ECN field */
static inline void tcp_ecn_received_counters(struct sock *sk,
const struct sk_buff *skb, u32 len)
{
u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;
u8 is_ce = INET_ECN_is_ce(ecnfield);
struct tcp_sock *tp = tcp_sk(sk);
bool ecn_edge;
if (!INET_ECN_is_not_ect(ecnfield)) {
u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs);
/* As for accurate ECN, the TCP_ECN_SEEN flag is set by
* tcp_ecn_received_counters() when the ECN codepoint of
* received TCP data or ACK contains ECT(0), ECT(1), or CE.
*/
if (!tcp_ecn_mode_rfc3168(tp))
tp->ecn_flags |= TCP_ECN_SEEN;
/* ACE counter tracks *all* segments including pure ACKs */
tp->received_ce += pcount;
tp->received_ce_pending = min(tp->received_ce_pending + pcount,
0xfU);
if (len > 0) {
u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield);
u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1];
u32 bytes_mask = GENMASK_U32(31, 22);
tp->received_ecn_bytes[ecnfield - 1] += len;
tp->accecn_minlen = max_t(u8, tp->accecn_minlen,
minlen);
/* Send AccECN option at least once per 2^22-byte
* increase in any ECN byte counter.
*/
if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) &
bytes_mask) {
tcp_accecn_opt_demand_min(sk, 1);
}
}
}
ecn_edge = tp->prev_ecnfield != ecnfield;
if (ecn_edge || is_ce) {
tp->prev_ecnfield = ecnfield;
/* Demand Accurate ECN change-triggered ACKs. Two ACK are
* demanded to indicate unambiguously the ecnfield value
* in the latter ACK.
*/
if (tcp_ecn_mode_accecn(tp)) {
if (ecn_edge)
inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
tp->accecn_opt_demand = 2;
}
}
}
/* AccECN specification, 2.2: [...] A Data Receiver maintains four counters
* initialized at the start of the half-connection. [...] These byte counters
* reflect only the TCP payload length, excluding TCP header and TCP options.
*/
static inline void tcp_ecn_received_counters_payload(struct sock *sk,
const struct sk_buff *skb)
{
const struct tcphdr *th = (const struct tcphdr *)skb->data;
tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4);
}
/* AccECN specification, 5.1: [...] a server can determine that it
* negotiated AccECN as [...] if the ACK contains an ACE field with
* the value 0b010 to 0b111 (decimal 2 to 7).
*/
static inline bool cookie_accecn_ok(const struct tcphdr *th)
{
return tcp_accecn_ace(th) > 0x1;
}
/* Used to form the ACE flags for SYN/ACK */
static inline u16 tcp_accecn_reflector_flags(u8 ect)
{
/* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN.
* Below is an excerpt from the 1st block of Table 2 of AccECN spec,
* in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE
*/
static const u8 ecn_to_ace_flags[4] = {
0b010, /* Not-ECT is received */
0b011, /* ECT(1) is received */
0b100, /* ECT(0) is received */
0b110 /* CE is received */
};
return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]);
}
/* AccECN specification, 3.1.2: If a TCP server that implements AccECN
* receives a SYN with the three TCP header flags (AE, CWR and ECE) set
* to any combination other than 000, 011 or 111, it MUST negotiate the
* use of AccECN as if they had been set to 111.
*/
static inline bool tcp_accecn_syn_requested(const struct tcphdr *th)
{
u8 ace = tcp_accecn_ace(th);
return ace && ace != 0x3;
}
static inline void __tcp_accecn_init_bytes_counters(int *counter_array)
{
BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1);
BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2);
BUILD_BUG_ON(INET_ECN_CE != 0x3);
counter_array[INET_ECN_ECT_1 - 1] = 0;
counter_array[INET_ECN_ECT_0 - 1] = 0;
counter_array[INET_ECN_CE - 1] = 0;
}
static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
{
tp->received_ce = 0;
tp->received_ce_pending = 0;
__tcp_accecn_init_bytes_counters(tp->received_ecn_bytes);
__tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes);
tp->accecn_minlen = 0;
tp->accecn_opt_demand = 0;
tp->est_ecnfield = 0;
}
/* Used for make_synack to form the ACE flags */
static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect)
{
/* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received
* from SYN. Below is an excerpt from Table 2 of the AccECN spec:
* +====================+====================================+
 * | IP-ECN codepoint | Respective ACE flags on SYN/ACK |
* | received on SYN | AE CWR ECE |
* +====================+====================================+
* | Not-ECT | 0 1 0 |
* | ECT(1) | 0 1 1 |
* | ECT(0) | 1 0 0 |
* | CE | 1 1 0 |
* +====================+====================================+
*/
th->ae = !!(ect & INET_ECN_ECT_0);
th->cwr = ect != INET_ECN_ECT_0;
th->ece = ect == INET_ECN_ECT_1;
}
static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb,
struct tcphdr *th)
{
u32 wire_ace;
/* The final packet of the 3WHS or anything like it must reflect
* the SYN/ACK ECT instead of putting CEP into ACE field, such
* case show up in tcp_flags.
*/
if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) {
wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET;
th->ece = !!(wire_ace & 0x1);
th->cwr = !!(wire_ace & 0x2);
th->ae = !!(wire_ace & 0x4);
tp->received_ce_pending = 0;
}
}
static inline u8 tcp_accecn_option_init(const struct sk_buff *skb,
u8 opt_offset)
{
u8 *ptr = skb_transport_header(skb) + opt_offset;
unsigned int optlen = ptr[1] - 2;
if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1))
return TCP_ACCECN_OPT_FAIL_SEEN;
ptr += 2;
/* Detect option zeroing: an AccECN connection "MAY check that the
* initial value of the EE0B field or the EE1B field is non-zero"
*/
if (optlen < TCPOLEN_ACCECN_PERFIELD)
return TCP_ACCECN_OPT_EMPTY_SEEN;
if (get_unaligned_be24(ptr) == 0)
return TCP_ACCECN_OPT_FAIL_SEEN;
if (optlen < TCPOLEN_ACCECN_PERFIELD * 3)
return TCP_ACCECN_OPT_COUNTER_SEEN;
ptr += TCPOLEN_ACCECN_PERFIELD * 2;
if (get_unaligned_be24(ptr) == 0)
return TCP_ACCECN_OPT_FAIL_SEEN;
return TCP_ACCECN_OPT_COUNTER_SEEN;
}
/* See Table 2 of the AccECN draft */
static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb,
const struct tcphdr *th, u8 ip_dsfield)
{
struct tcp_sock *tp = tcp_sk(sk);
u8 ace = tcp_accecn_ace(th);
switch (ace) {
case 0x0:
case 0x7:
/* +========+========+============+=============+
* | A | B | SYN/ACK | Feedback |
* | | | B->A | Mode of A |
* | | | AE CWR ECE | |
* +========+========+============+=============+
* | AccECN | No ECN | 0 0 0 | Not ECN |
* | AccECN | Broken | 1 1 1 | Not ECN |
* +========+========+============+=============+
*/
tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
break;
case 0x1:
case 0x5:
/* +========+========+============+=============+
* | A | B | SYN/ACK | Feedback |
* | | | B->A | Mode of A |
* | | | AE CWR ECE | |
* +========+========+============+=============+
* | AccECN | Nonce | 1 0 1 | (Reserved) |
* | AccECN | ECN | 0 0 1 | Classic ECN |
* | Nonce | AccECN | 0 0 1 | Classic ECN |
* | ECN | AccECN | 0 0 1 | Classic ECN |
* +========+========+============+=============+
*/
if (tcp_ecn_mode_pending(tp))
/* Downgrade from AccECN, or requested initially */
tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
break;
default:
tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK;
if (tp->rx_opt.accecn &&
tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);
tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
tp->accecn_opt_demand = 2;
}
if (INET_ECN_is_ce(ip_dsfield) &&
tcp_accecn_validate_syn_feedback(sk, ace,
tp->syn_ect_snt)) {
tp->received_ce++;
tp->received_ce_pending++;
}
break;
}
}
static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th,
const struct sk_buff *skb)
{
if (tcp_ecn_mode_pending(tp)) {
if (!tcp_accecn_syn_requested(th)) {
/* Downgrade to classic ECN feedback */
tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
} else {
tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
INET_ECN_MASK;
tp->prev_ecnfield = tp->syn_ect_rcv;
tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
}
}
if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr))
tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
}
@@ -61,7 +554,7 @@ static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp,
/* Packet ECN state for a SYN-ACK */
static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
if (tcp_ecn_disabled(tp))
@@ -69,6 +562,13 @@ static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
else if (tcp_ca_needs_ecn(sk) ||
tcp_bpf_ca_needs_ecn(sk))
INET_ECN_xmit(sk);
if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) {
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
TCP_SKB_CB(skb)->tcp_flags |=
tcp_accecn_reflector_flags(tp->syn_ect_rcv);
tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
}
}
/* Packet ECN state for a SYN. */
@@ -76,8 +576,13 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
bool use_ecn, use_accecn;
u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn);
use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN;
use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN ||
tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN ||
tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn;
if (!use_ecn) {
const struct dst_entry *dst = __sk_dst_get(sk);
@@ -93,24 +598,45 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
INET_ECN_xmit(sk);
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
if (use_accecn) {
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE;
tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING);
tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
} else {
tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
}
}
}
static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
{
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) {
/* tp->ecn_flags are cleared at a later point in time when
 * SYN ACK is ultimately being received.
 */
TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
}
}
static inline void
tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
{
if (inet_rsk(req)->ecn_ok)
if (tcp_rsk(req)->accecn_ok)
tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv);
else if (inet_rsk(req)->ecn_ok)
th->ece = 1;
}
static inline bool tcp_accecn_option_beacon_check(const struct sock *sk)
{
u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon);
const struct tcp_sock *tp = tcp_sk(sk);
if (!ecn_beacon)
return false;
return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * ecn_beacon >=
(tp->srtt_us >> 3);
}
#endif /* _LINUX_TCP_ECN_H */

View File

@@ -316,6 +316,15 @@ struct tcp_info {
* in milliseconds, including any
* unfinished recovery.
*/
__u32 tcpi_received_ce; /* # of CE marks received */
__u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */
__u32 tcpi_delivered_e0_bytes;
__u32 tcpi_delivered_ce_bytes;
__u32 tcpi_received_e1_bytes;
__u32 tcpi_received_e0_bytes;
__u32 tcpi_received_ce_bytes;
__u16 tcpi_accecn_fail_mode;
__u16 tcpi_accecn_opt_seen;
};
/* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */

View File

@@ -12,6 +12,7 @@
#include <linux/export.h>
#include <net/secure_seq.h>
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/route.h>
static siphash_aligned_key_t syncookie_secret[2];
@@ -403,6 +404,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_request_sock *ireq;
struct net *net = sock_net(sk);
struct tcp_request_sock *treq;
struct request_sock *req;
struct sock *ret = sk;
struct flowi4 fl4;
@@ -428,6 +430,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
}
ireq = inet_rsk(req);
treq = tcp_rsk(req);
sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
@@ -483,6 +486,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
if (!req->syncookie)
ireq->rcv_wscale = rcv_wscale;
ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst);
treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th);
ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst);
/* ip_queue_xmit() depends on our flow being setup

View File

@@ -47,6 +47,7 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;
static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
static int tcp_ecn_mode_max = 2;
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -728,8 +729,26 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &tcp_ecn_mode_max,
},
{
.procname = "tcp_ecn_option",
.data = &init_net.ipv4.sysctl_tcp_ecn_option,
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
},
{
.procname = "tcp_ecn_option_beacon",
.data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon,
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_THREE,
},
{
.procname = "tcp_ecn_fallback",
.data = &init_net.ipv4.sysctl_tcp_ecn_fallback,

View File

@@ -270,7 +270,9 @@
#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/mptcp.h>
#include <net/proto_memory.h>
#include <net/xfrm.h>
@@ -3406,6 +3408,11 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->window_clamp = 0;
tp->delivered = 0;
tp->delivered_ce = 0;
tp->accecn_fail_mode = 0;
tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
tcp_accecn_init_counters(tp);
tp->prev_ecnfield = 0;
tp->accecn_opt_tstamp = 0;
if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
@@ -4152,6 +4159,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
const u8 ect1_idx = INET_ECN_ECT_1 - 1;
const u8 ect0_idx = INET_ECN_ECT_0 - 1;
const u8 ce_idx = INET_ECN_CE - 1;
unsigned long rate;
u32 now;
u64 rate64;
@@ -4278,6 +4288,16 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (tp->rto_stamp)
info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;
info->tcpi_accecn_fail_mode = tp->accecn_fail_mode;
info->tcpi_accecn_opt_seen = tp->saw_accecn_opt;
info->tcpi_received_ce = tp->received_ce;
info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx];
info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx];
info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx];
info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx];
info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx];
info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx];
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -5119,11 +5139,12 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89);
CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 97);
/* TXRX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
@@ -5138,6 +5159,8 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);
@@ -5145,7 +5168,7 @@ static void __init tcp_struct_check(void)
/* 32bit arches with 8byte alignment on u64 fields might need padding
* before tcp_clock_cache.
*/
CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 91 + 4);
CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 107 + 4);
/* RX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
@@ -5157,12 +5180,13 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space);
CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 96);
CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 112);
}
void __init tcp_init(void)

View File

@@ -70,6 +70,7 @@
#include <linux/sysctl.h>
#include <linux/kernel.h>
#include <linux/prefetch.h>
#include <linux/bitops.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/tcp_ecn.h>
@@ -360,21 +361,119 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb)
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) &&
tcp_ecn_mode_rfc3168(tp)) {
/* Better not delay acks, sender can have a very low cwnd */
tcp_enter_quickack_mode(sk, 2);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
/* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by
* tcp_data_ecn_check() when the ECN codepoint of
* received TCP data contains ECT(0), ECT(1), or CE.
*/
if (!tcp_ecn_mode_rfc3168(tp))
break;
tp->ecn_flags |= TCP_ECN_SEEN;
break;
default:
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
if (!tcp_ecn_mode_rfc3168(tp))
break;
tp->ecn_flags |= TCP_ECN_SEEN;
break;
}
}
/* Process the AccECN option (if present) on an incoming ACK and update
 * tp->delivered_ecn_bytes[] accordingly.
 *
 * Returns true if the byte counters can be used, i.e. the per-ECN-field
 * delivered byte deltas derived on this ACK are unambiguous; false when
 * the option is absent/failed or the counter deltas are ambiguous.
 */
static bool tcp_accecn_process_option(struct tcp_sock *tp,
				      const struct sk_buff *skb,
				      u32 delivered_bytes, int flag)
{
	u8 estimate_ecnfield = tp->est_ecnfield;
	bool ambiguous_ecn_bytes_incr = false;
	bool first_changed = false;
	unsigned int optlen;
	bool order1, res;
	unsigned int i;
	u8 *ptr;

	/* Option processing was already declared failed for this peer. */
	if (tcp_accecn_opt_fail_recv(tp))
		return false;

	/* No parsed AccECN option on this ACK (fast path, or option absent) */
	if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) {
		if (!tp->saw_accecn_opt) {
			/* Too late to enable after this point due to
			 * potential counter wraps
			 */
			if (tp->bytes_sent >= (1 << 23) - 1) {
				u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN;

				tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
			}
			return false;
		}

		/* No option on this ACK: attribute all newly delivered
		 * bytes to the currently estimated ECN field (1-based,
		 * hence the -1 index adjustment).
		 */
		if (estimate_ecnfield) {
			u8 ecnfield = estimate_ecnfield - 1;

			tp->delivered_ecn_bytes[ecnfield] += delivered_bytes;
			return true;
		}
		return false;
	}

	/* rx_opt.accecn holds the option's byte offset within the TCP
	 * header; ptr[0] is the option kind, ptr[1] the option length.
	 */
	ptr = skb_transport_header(skb) + tp->rx_opt.accecn;
	optlen = ptr[1] - 2;
	if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1))
		return false;
	/* The two option kinds carry the counter fields in opposite order */
	order1 = (ptr[0] == TCPOPT_ACCECN1);
	ptr += 2;

	/* First time we see counters: validate the option format once */
	if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
		tp->saw_accecn_opt = tcp_accecn_option_init(skb,
							    tp->rx_opt.accecn);
		if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN)
			tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
	}

	res = !!estimate_ecnfield;
	/* Walk up to three 24-bit per-ECN-field byte counters */
	for (i = 0; i < 3; i++) {
		u32 init_offset;
		u8 ecnfield;
		s32 delta;
		u32 *cnt;

		if (optlen < TCPOLEN_ACCECN_PERFIELD)
			break;

		ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1);
		init_offset = tcp_accecn_field_init_offset(ecnfield);
		cnt = &tp->delivered_ecn_bytes[ecnfield - 1];
		delta = tcp_update_ecn_bytes(cnt, ptr, init_offset);
		/* A negative delta indicates a counter went backwards,
		 * which makes the byte counters unusable on this ACK.
		 */
		if (delta && delta < 0) {
			res = false;
			ambiguous_ecn_bytes_incr = true;
		}
		/* Track which ECN field is actually growing; more than
		 * one growing field makes the estimate ambiguous.
		 */
		if (delta && ecnfield != estimate_ecnfield) {
			if (!first_changed) {
				tp->est_ecnfield = ecnfield;
				first_changed = true;
			} else {
				res = false;
				ambiguous_ecn_bytes_incr = true;
			}
		}

		optlen -= TCPOLEN_ACCECN_PERFIELD;
		ptr += TCPOLEN_ACCECN_PERFIELD;
	}
	/* Ambiguity invalidates the estimated ECN field entirely */
	if (ambiguous_ecn_bytes_incr)
		tp->est_ecnfield = 0;

	return res;
}
static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count)
{
tp->delivered_ce += ecn_count;
@@ -385,10 +484,101 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
bool ece_ack)
{
tp->delivered += delivered;
if (ece_ack)
if (tcp_ecn_mode_rfc3168(tp) && ece_ack)
tcp_count_delivered_ce(tp, delivered);
}
/* Returns the ECN CE delta: the number of newly CE-marked packets
 * inferred from the 3-bit ACE field of this ACK, cross-checked against
 * the AccECN option byte counters when those are usable.  The ACE field
 * wraps at TCP_ACCECN_CEP_ACE_MASK + 1, so for large ACKs heuristics
 * decide between the minimal delta and a wrap-corrected "safe" delta.
 */
static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
				u32 delivered_pkts, u32 delivered_bytes,
				int flag)
{
	u32 old_ceb = tcp_sk(sk)->delivered_ecn_bytes[INET_ECN_CE - 1];
	const struct tcphdr *th = tcp_hdr(skb);
	struct tcp_sock *tp = tcp_sk(sk);
	u32 delta, safe_delta, d_ceb;
	bool opt_deltas_valid;
	u32 corrected_ace;

	/* Reordered ACK or uncertain due to lack of data to send and ts */
	if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS)))
		return 0;

	/* Side effect: updates tp->delivered_ecn_bytes[] (incl. CE, read
	 * back below via old_ceb).
	 */
	opt_deltas_valid = tcp_accecn_process_option(tp, skb,
						     delivered_bytes, flag);

	if (!(flag & FLAG_SLOWPATH)) {
		/* AccECN counter might overflow on large ACKs */
		if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
			return 0;
	}

	/* ACE field is not available during handshake */
	if (flag & FLAG_SYN_ACKED)
		return 0;

	/* Too many unacknowledged CE marks pending: force an ACK out */
	if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA)
		inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;

	/* Minimal (modulo ACE wrap) CE delta implied by the ACE field */
	corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET;
	delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK;
	/* Few enough packets that the ACE field cannot have wrapped */
	if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
		return delta;

	/* Wrap-corrected delta assuming every full ACE cycle was CE */
	safe_delta = delivered_pkts -
		     ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK);

	if (opt_deltas_valid) {
		/* CE bytes newly accounted by the option on this ACK */
		d_ceb = tp->delivered_ecn_bytes[INET_ECN_CE - 1] - old_ceb;
		if (!d_ceb)
			return delta;

		/* Multi-wrap heuristic: with SACK or a calm CA state,
		 * estimate the CE packet count from the CE byte share.
		 */
		if ((delivered_pkts >= (TCP_ACCECN_CEP_ACE_MASK + 1) * 2) &&
		    (tcp_is_sack(tp) ||
		     ((1 << inet_csk(sk)->icsk_ca_state) &
		      (TCPF_CA_Open | TCPF_CA_CWR)))) {
			u32 est_d_cep;

			if (delivered_bytes <= d_ceb)
				return safe_delta;

			est_d_cep = DIV_ROUND_UP_ULL((u64)d_ceb *
						     delivered_pkts,
						     delivered_bytes);
			return min(safe_delta,
				   delta +
				   (est_d_cep & ~TCP_ACCECN_CEP_ACE_MASK));
		}

		/* More CE bytes than the minimal delta can explain at
		 * full-MSS packets: the ACE field must have wrapped.
		 */
		if (d_ceb > delta * tp->mss_cache)
			return safe_delta;
		/* Far fewer CE bytes than safe_delta would imply: trust
		 * the minimal delta.
		 */
		if (d_ceb <
		    safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT)
			return delta;
	}

	return safe_delta;
}
/* Compute the CE delta for this ACK via __tcp_accecn_process() and apply
 * it: account the newly CE-marked packets, flag the ACK as carrying ECN
 * congestion feedback (FLAG_ECE), and refresh the header predictor so
 * the fast path keys on the updated state.  Returns the CE delta.
 */
static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
			      u32 delivered_pkts, u32 delivered_bytes,
			      int *flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 ce_delta = __tcp_accecn_process(sk, skb, delivered_pkts,
					    delivered_bytes, *flag);

	if (!ce_delta)
		return ce_delta;

	tcp_count_delivered_ce(tp, ce_delta);
	*flag |= FLAG_ECE;
	/* Recalculate header predictor */
	if (tp->pred_flags)
		tcp_fast_path_on(tp);
	return ce_delta;
}
/* Buffer size and advertised window tuning.
*
* 1. Tuning sk->sk_sndbuf, when connection enters established state.
@@ -987,6 +1177,7 @@ struct tcp_sacktag_state {
u64 last_sackt;
u32 reord;
u32 sack_delivered;
u32 delivered_bytes;
int flag;
unsigned int mss_now;
struct rate_sample *rate;
@@ -1348,7 +1539,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
u32 start_seq, u32 end_seq,
int dup_sack, int pcount,
int dup_sack, int pcount, u32 plen,
u64 xmit_time)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -1408,6 +1599,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
tp->sacked_out += pcount;
/* Out-of-order packets delivered */
state->sack_delivered += pcount;
state->delivered_bytes += plen;
}
/* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1444,7 +1636,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
* tcp_highest_sack_seq() when skb is highest_sack.
*/
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
start_seq, end_seq, dup_sack, pcount,
start_seq, end_seq, dup_sack, pcount, skb->len,
tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
@@ -1729,6 +1921,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
TCP_SKB_CB(skb)->end_seq,
dup_sack,
tcp_skb_pcount(skb),
skb->len,
tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
@@ -3237,6 +3430,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (sacked & TCPCB_SACKED_ACKED) {
tp->sacked_out -= acked_pcount;
/* snd_una delta covers these skbs */
sack->delivered_bytes -= skb->len;
} else if (tcp_is_sack(tp)) {
tcp_count_delivered(tp, acked_pcount, ece_ack);
if (!tcp_skb_spurious_retrans(tp, skb))
@@ -3333,6 +3528,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (before(reord, prior_fack))
tcp_check_sack_reordering(sk, reord, 0);
}
sack->delivered_bytes = (skb ?
TCP_SKB_CB(skb)->seq : tp->snd_una) -
prior_snd_una;
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
tcp_skb_timestamp_us(skb))) {
@@ -3602,8 +3801,18 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
}
/* Send an ACK for rcv_nxt; when acting as an AccECN reflector, set the
 * handshake flag bits that echo the ECT codepoint received on the SYN
 * (tp->syn_ect_rcv), otherwise send a plain ACK.
 */
static void tcp_send_ack_reflect_ect(struct sock *sk, bool accecn_reflector)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u16 flags;

	flags = accecn_reflector ? tcp_accecn_reflector_flags(tp->syn_ect_rcv)
				 : 0;
	__tcp_send_ack(sk, tp->rcv_nxt, flags);
}
/* RFC 5961 7 [ACK Throttling] */
static void tcp_send_challenge_ack(struct sock *sk)
static void tcp_send_challenge_ack(struct sock *sk, bool accecn_reflector)
{
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
@@ -3633,7 +3842,7 @@ static void tcp_send_challenge_ack(struct sock *sk)
WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1);
send_ack:
NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
tcp_send_ack(sk);
tcp_send_ack_reflect_ect(sk, accecn_reflector);
}
}
@@ -3744,7 +3953,8 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
}
/* Returns the number of packets newly acked or sacked by the current ACK */
static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered,
u32 ecn_count, int flag)
{
const struct net *net = sock_net(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -3752,8 +3962,12 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
delivered = tp->delivered - prior_delivered;
NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
if (flag & FLAG_ECE)
NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
if (flag & FLAG_ECE) {
if (tcp_ecn_mode_rfc3168(tp))
ecn_count = delivered;
NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count);
}
return delivered;
}
@@ -3774,11 +3988,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 delivered = tp->delivered;
u32 lost = tp->lost;
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? */
u32 prior_fack;
sack_state.first_sackt = 0;
sack_state.rate = &rs;
sack_state.sack_delivered = 0;
sack_state.delivered_bytes = 0;
/* We very likely will need to access rtx queue. */
prefetch(sk->tcp_rtx_queue.rb_node);
@@ -3794,7 +4010,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
if (before(ack, prior_snd_una - max_window)) {
if (!(flag & FLAG_NO_CHALLENGE_ACK))
tcp_send_challenge_ack(sk);
tcp_send_challenge_ack(sk, false);
return -SKB_DROP_REASON_TCP_TOO_OLD_ACK;
}
goto old_ack;
@@ -3881,6 +4097,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_rack_update_reo_wnd(sk, &rs);
if (tcp_ecn_mode_accecn(tp))
ecn_count = tcp_accecn_process(sk, skb,
tp->delivered - delivered,
sack_state.delivered_bytes,
&flag);
tcp_in_ack_event(sk, flag);
if (tp->tlp_high_seq)
@@ -3905,7 +4127,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);
delivered = tcp_newly_delivered(sk, delivered, flag);
delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag);
lost = tp->lost - lost; /* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
@@ -3914,12 +4137,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
return 1;
no_queue:
if (tcp_ecn_mode_accecn(tp))
ecn_count = tcp_accecn_process(sk, skb,
tp->delivered - delivered,
sack_state.delivered_bytes,
&flag);
tcp_in_ack_event(sk, flag);
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK) {
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
tcp_newly_delivered(sk, delivered, flag);
tcp_newly_delivered(sk, delivered, ecn_count, flag);
}
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
@@ -3940,7 +4168,7 @@ old_ack:
&sack_state);
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
tcp_newly_delivered(sk, delivered, flag);
tcp_newly_delivered(sk, delivered, ecn_count, flag);
tcp_xmit_recovery(sk, rexmit);
}
@@ -4040,6 +4268,7 @@ void tcp_parse_options(const struct net *net,
ptr = (const unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
opt_rx->accecn = 0;
opt_rx->saw_unknown = 0;
while (length > 0) {
@@ -4131,6 +4360,12 @@ void tcp_parse_options(const struct net *net,
ptr, th->syn, foc, false);
break;
case TCPOPT_ACCECN0:
case TCPOPT_ACCECN1:
/* Save offset of AccECN option in TCP header */
opt_rx->accecn = (ptr - 2) - (__u8 *)th;
break;
case TCPOPT_EXP:
/* Fast Open option shares code 254 using a
* 16 bits magic number.
@@ -4191,11 +4426,14 @@ static bool tcp_fast_parse_options(const struct net *net,
*/
if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
tp->rx_opt.accecn = 0;
return false;
} else if (tp->rx_opt.tstamp_ok &&
th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
if (tcp_parse_aligned_timestamp(tp, th))
if (tcp_parse_aligned_timestamp(tp, th)) {
tp->rx_opt.accecn = 0;
return true;
}
}
tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
@@ -5828,6 +6066,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, int syn_inerr)
{
struct tcp_sock *tp = tcp_sk(sk);
bool accecn_reflector = false;
SKB_DR(reason);
/* RFC1323: H1. Apply PAWS check first. */
@@ -5925,7 +6164,7 @@ step1:
if (tp->syn_fastopen && !tp->data_segs_in &&
sk->sk_state == TCP_ESTABLISHED)
tcp_fastopen_active_disable(sk);
tcp_send_challenge_ack(sk);
tcp_send_challenge_ack(sk, false);
SKB_DR_SET(reason, TCP_RESET);
goto discard;
}
@@ -5936,6 +6175,16 @@ step1:
* RFC 5961 4.2 : Send a challenge ack
*/
if (th->syn) {
if (tcp_ecn_mode_accecn(tp)) {
accecn_reflector = true;
if (tp->rx_opt.accecn &&
tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);
tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
tcp_accecn_opt_demand_min(sk, 1);
}
}
if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq &&
TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt &&
@@ -5945,7 +6194,7 @@ syn_challenge:
if (syn_inerr)
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
tcp_send_challenge_ack(sk);
tcp_send_challenge_ack(sk, accecn_reflector);
SKB_DR_SET(reason, TCP_INVALID_SYN);
goto discard;
}
@@ -6017,6 +6266,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
*/
tp->rx_opt.saw_tstamp = 0;
tp->rx_opt.accecn = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_prediction is to be made
@@ -6071,6 +6321,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
flag |= __tcp_replace_ts_recent(tp,
delta);
tcp_ecn_received_counters(sk, skb, 0);
/* We know that such packets are checksummed
* on entry.
*/
@@ -6119,6 +6371,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
/* Bulk data transfer: receiver */
tcp_cleanup_skb(skb);
__skb_pull(skb, tcp_header_len);
tcp_ecn_received_counters(sk, skb,
len - tcp_header_len);
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
tcp_event_data_recv(sk, skb);
@@ -6159,6 +6413,8 @@ validate:
return;
step5:
tcp_ecn_received_counters_payload(sk, skb);
reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT);
if ((int)reason < 0) {
reason = -reason;
@@ -6409,7 +6665,9 @@ consume:
* state to ESTABLISHED..."
*/
tcp_ecn_rcv_synack(tp, th);
if (tcp_ecn_mode_any(tp))
tcp_ecn_rcv_synack(sk, skb, th,
TCP_SKB_CB(skb)->ip_dsfield);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
tcp_try_undo_spurious_syn(sk);
@@ -6481,7 +6739,7 @@ consume:
TCP_DELACK_MAX, false);
goto consume;
}
tcp_send_ack(sk);
tcp_send_ack_reflect_ect(sk, tcp_ecn_mode_accecn(tp));
return -1;
}
@@ -6540,7 +6798,7 @@ consume:
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->max_window = tp->snd_wnd;
tcp_ecn_rcv_syn(tp, th);
tcp_ecn_rcv_syn(tp, th, skb);
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -6722,7 +6980,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
}
/* accept old ack during closing */
if ((int)reason < 0) {
tcp_send_challenge_ack(sk);
tcp_send_challenge_ack(sk, false);
reason = -reason;
goto discard;
}
@@ -6769,9 +7027,12 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tp->lsndtime = tcp_jiffies32;
tcp_initialize_rcv_mss(sk);
if (tcp_ecn_mode_accecn(tp))
tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt);
tcp_fast_path_on(tp);
if (sk->sk_shutdown & SEND_SHUTDOWN)
tcp_shutdown(sk, SEND_SHUTDOWN);
break;
case TCP_FIN_WAIT1: {
@@ -6941,6 +7202,15 @@ static void tcp_ecn_create_request(struct request_sock *req,
bool ect, ecn_ok;
u32 ecn_ok_dst;
if (tcp_accecn_syn_requested(th) &&
READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) {
inet_rsk(req)->ecn_ok = 1;
tcp_rsk(req)->accecn_ok = 1;
tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
INET_ECN_MASK;
return;
}
if (!th_ecn)
return;
@@ -6948,7 +7218,8 @@ static void tcp_ecn_create_request(struct request_sock *req,
ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst;
if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
if (((!ect || th->res1 || th->ae) && ecn_ok) ||
tcp_ca_needs_ecn(listen_sk) ||
(ecn_ok_dst & DST_FEATURE_ECN_CA) ||
tcp_bpf_ca_needs_ecn((struct sock *)req))
inet_rsk(req)->ecn_ok = 1;
@@ -6966,6 +7237,11 @@ static void tcp_openreq_init(struct request_sock *req,
tcp_rsk(req)->snt_synack = 0;
tcp_rsk(req)->snt_tsval_first = 0;
tcp_rsk(req)->last_oow_ack_time = 0;
tcp_rsk(req)->accecn_ok = 0;
tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
tcp_rsk(req)->accecn_fail_mode = 0;
tcp_rsk(req)->syn_ect_rcv = 0;
tcp_rsk(req)->syn_ect_snt = 0;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
ireq->tstamp_ok = rx_opt->tstamp_ok;

View File

@@ -65,6 +65,7 @@
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
@@ -1189,7 +1190,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
enum tcp_synack_type synack_type,
struct sk_buff *syn_skb)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
int err = -1;
struct sk_buff *skb;
@@ -1202,6 +1203,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
tos = READ_ONCE(inet_sk(sk)->tos);
@@ -3558,7 +3560,9 @@ fallback:
static int __net_init tcp_sk_init(struct net *net)
{
net->ipv4.sysctl_tcp_ecn = 2;
net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
net->ipv4.sysctl_tcp_ecn_fallback = 1;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;

View File

@@ -20,6 +20,7 @@
*/
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/xfrm.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
@@ -451,12 +452,26 @@ void tcp_openreq_init_rwin(struct request_sock *req,
ireq->rcv_wscale = rcv_wscale;
}
static void tcp_ecn_openreq_child(struct tcp_sock *tp,
const struct request_sock *req)
static void tcp_ecn_openreq_child(struct sock *sk,
const struct request_sock *req,
const struct sk_buff *skb)
{
tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
TCP_ECN_MODE_RFC3168 :
TCP_ECN_DISABLED);
const struct tcp_request_sock *treq = tcp_rsk(req);
struct tcp_sock *tp = tcp_sk(sk);
if (treq->accecn_ok) {
tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
tp->syn_ect_snt = treq->syn_ect_snt;
tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
tp->saw_accecn_opt = treq->saw_accecn_opt;
tp->prev_ecnfield = treq->syn_ect_rcv;
tp->accecn_opt_demand = 1;
tcp_ecn_received_counters_payload(sk, skb);
} else {
tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
TCP_ECN_MODE_RFC3168 :
TCP_ECN_DISABLED);
}
}
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
@@ -621,7 +636,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
newtp->rx_opt.mss_clamp = req->mss;
tcp_ecn_openreq_child(newtp, req);
tcp_ecn_openreq_child(newsk, req, skb);
newtp->fastopen_req = NULL;
RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
@@ -664,6 +679,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
bool own_req;
tmp_opt.saw_tstamp = 0;
tmp_opt.accecn = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
@@ -841,6 +857,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!(flg & TCP_FLAG_ACK))
return NULL;
if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn &&
tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn);
tcp_rsk(req)->saw_accecn_opt = saw_opt;
if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) {
u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV;
tcp_rsk(req)->accecn_fail_mode |= fail_mode;
}
}
/* For Fast Open no more processing is needed (sk is the
* child socket).
*/

View File

@@ -328,7 +328,15 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
{
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_ecn_mode_rfc3168(tp)) {
if (!tcp_ecn_mode_any(tp))
return;
if (tcp_ecn_mode_accecn(tp)) {
if (!tcp_accecn_ace_fail_recv(tp))
INET_ECN_xmit(sk);
tcp_accecn_set_ace(tp, skb, th);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN;
} else {
/* Not-retransmitted data segment: set ECT and inject CWR. */
if (skb->len != tcp_header_len &&
!before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
@@ -377,6 +385,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_SMC BIT(9)
#define OPTION_MPTCP BIT(10)
#define OPTION_AO BIT(11)
#define OPTION_ACCECN BIT(12)
static void smc_options_write(__be32 *ptr, u16 *options)
{
@@ -398,6 +407,8 @@ struct tcp_out_options {
u16 mss; /* 0 to disable */
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
u8 num_accecn_fields:7, /* number of AccECN fields needed */
use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */
u8 hash_size; /* bytes in hash_location */
u8 bpf_opt_len; /* length of BPF hdr option */
__u8 *hash_location; /* temporary pointer, overloaded */
@@ -595,6 +606,11 @@ static __be32 *process_tcp_ao_options(struct tcp_sock *tp,
return ptr;
}
/* Initial values for AccECN option, ordered is based on ECN field bits
* similar to received_ecn_bytes. Used for SYN/ACK AccECN option.
*/
static const u32 synack_ecn_bytes[3] = { 0, 0, 0 };
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -613,6 +629,8 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
struct tcp_out_options *opts,
struct tcp_key *key)
{
u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */
u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */
__be32 *ptr = (__be32 *)(th + 1);
u16 options = opts->options; /* mungable copy */
@@ -648,15 +666,75 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
*ptr++ = htonl(opts->tsecr);
}
if (OPTION_ACCECN & options) {
const u32 *ecn_bytes = opts->use_synack_ecn_bytes ?
synack_ecn_bytes :
tp->received_ecn_bytes;
const u8 ect0_idx = INET_ECN_ECT_0 - 1;
const u8 ect1_idx = INET_ECN_ECT_1 - 1;
const u8 ce_idx = INET_ECN_CE - 1;
u32 e0b;
u32 e1b;
u32 ceb;
u8 len;
e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET;
e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET;
ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET;
len = TCPOLEN_ACCECN_BASE +
opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD;
if (opts->num_accecn_fields == 2) {
*ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
((e1b >> 8) & 0xffff));
*ptr++ = htonl(((e1b & 0xff) << 24) |
(ceb & 0xffffff));
} else if (opts->num_accecn_fields == 1) {
*ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
((e1b >> 8) & 0xffff));
leftover_highbyte = e1b & 0xff;
leftover_lowbyte = TCPOPT_NOP;
} else if (opts->num_accecn_fields == 0) {
leftover_highbyte = TCPOPT_ACCECN1;
leftover_lowbyte = len;
} else if (opts->num_accecn_fields == 3) {
*ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
((e1b >> 8) & 0xffff));
*ptr++ = htonl(((e1b & 0xff) << 24) |
(ceb & 0xffffff));
*ptr++ = htonl(((e0b & 0xffffff) << 8) |
TCPOPT_NOP);
}
if (tp) {
tp->accecn_minlen = 0;
tp->accecn_opt_tstamp = tp->tcp_mstamp;
if (tp->accecn_opt_demand)
tp->accecn_opt_demand--;
}
}
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
*ptr++ = htonl((leftover_highbyte << 24) |
(leftover_lowbyte << 16) |
(TCPOPT_SACK_PERM << 8) |
TCPOLEN_SACK_PERM);
leftover_highbyte = TCPOPT_NOP;
leftover_lowbyte = TCPOPT_NOP;
}
if (unlikely(OPTION_WSCALE & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
u8 highbyte = TCPOPT_NOP;
/* Do not split the leftover 2-byte to fit into a single
* NOP, i.e., replace this NOP only when 1 byte is leftover
* within leftover_highbyte.
*/
if (unlikely(leftover_highbyte != TCPOPT_NOP &&
leftover_lowbyte == TCPOPT_NOP)) {
highbyte = leftover_highbyte;
leftover_highbyte = TCPOPT_NOP;
}
*ptr++ = htonl((highbyte << 24) |
(TCPOPT_WINDOW << 16) |
(TCPOLEN_WINDOW << 8) |
opts->ws);
@@ -667,11 +745,13 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
tp->duplicate_sack : tp->selective_acks;
int this_sack;
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
*ptr++ = htonl((leftover_highbyte << 24) |
(leftover_lowbyte << 16) |
(TCPOPT_SACK << 8) |
(TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
TCPOLEN_SACK_PERBLOCK)));
leftover_highbyte = TCPOPT_NOP;
leftover_lowbyte = TCPOPT_NOP;
for (this_sack = 0; this_sack < opts->num_sack_blocks;
++this_sack) {
@@ -680,6 +760,14 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
}
tp->rx_opt.dsack = 0;
} else if (unlikely(leftover_highbyte != TCPOPT_NOP ||
leftover_lowbyte != TCPOPT_NOP)) {
*ptr++ = htonl((leftover_highbyte << 24) |
(leftover_lowbyte << 16) |
(TCPOPT_NOP << 8) |
TCPOPT_NOP);
leftover_highbyte = TCPOPT_NOP;
leftover_lowbyte = TCPOPT_NOP;
}
if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
@@ -760,6 +848,80 @@ static void mptcp_set_option_cond(const struct request_sock *req,
}
}
/* Number of alignment-padding NOP bytes that other already-scheduled TCP
 * options leave free on a SYN/ACK, which the AccECN option may reuse.
 * SACK_PERM without timestamps appears to leave two padding bytes and
 * window scale one — NOTE(review): inferred from the option layout in
 * tcp_options_write(); confirm against that writer.
 */
static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts)
{
	u32 saving = 0;

	if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) ==
	    OPTION_SACK_ADVERTISE)
		saving = 2;
	else if (opts->options & OPTION_WSCALE)
		saving = 1;

	return saving;
}
/* Calculates how long AccECN option will fit to @remaining option space.
 *
 * AccECN option can sometimes replace NOPs used for alignment of other
 * TCP options (up to @max_combine_saving available).
 *
 * Only solutions with at least @required AccECN fields are accepted.
 *
 * Returns: The size of the AccECN option excluding space repurposed from
 * the alignment of the other options.
 */
static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required,
				  int remaining)
{
	int size = TCP_ACCECN_MAXSIZE;
	int sack_blocks_reduce = 0;
	int max_combine_saving;
	int rem = remaining;
	int align_size;

	/* How many padding bytes can be reused depends on which other
	 * options are present: SYN/ACKs have their own helper, while on
	 * data segments only a SACK option's alignment padding is reusable.
	 */
	if (opts->use_synack_ecn_bytes)
		max_combine_saving = tcp_synack_options_combine_saving(opts);
	else
		max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0;
	/* Start from the largest option and drop one counter field per
	 * iteration until the option fits into @rem (or fewer than
	 * @required fields would be left, in which case it cannot be sent).
	 */
	opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS;
	while (opts->num_accecn_fields >= required) {
		/* Pad to dword if cannot combine: when the overhang past a
		 * 4-byte boundary exceeds the reusable padding, round the
		 * cost up; otherwise the overhang hides in the other
		 * options' padding and the cost rounds down.
		 */
		if ((size & 0x3) > max_combine_saving)
			align_size = ALIGN(size, 4);
		else
			align_size = ALIGN_DOWN(size, 4);
		if (rem >= align_size) {
			size = align_size;
			break;
		} else if (opts->num_accecn_fields == required &&
			   opts->num_sack_blocks > 2 &&
			   required > 0) {
			/* Try to fit the option by removing one SACK block */
			opts->num_sack_blocks--;
			sack_blocks_reduce++;
			rem = rem + TCPOLEN_SACK_PERBLOCK;
			/* Restart the search with the space the dropped
			 * SACK block freed up.
			 */
			opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS;
			size = TCP_ACCECN_MAXSIZE;
			continue;
		}
		opts->num_accecn_fields--;
		size -= TCPOLEN_ACCECN_PERFIELD;
	}
	if (sack_blocks_reduce > 0) {
		/* If AccECN fit only because SACK blocks were dropped,
		 * report the reclaimed SACK bytes as already-accounted
		 * space (the caller charged them to SACK); if AccECN still
		 * did not fit, give the SACK blocks back instead.
		 */
		if (opts->num_accecn_fields >= required)
			size -= sack_blocks_reduce * TCPOLEN_SACK_PERBLOCK;
		else
			opts->num_sack_blocks += sack_blocks_reduce;
	}
	/* Could not reach the minimum number of fields: send no AccECN
	 * option and consume no option space.
	 */
	if (opts->num_accecn_fields < required)
		return 0;
	opts->options |= OPTION_ACCECN;
	return size;
}
/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
@@ -842,6 +1004,20 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
}
/* Simultaneous open SYN/ACK needs AccECN option but not SYN.
* It is attempted to negotiate the use of AccECN also on the first
* retransmitted SYN, as mentioned in "3.1.4.1. Retransmitted SYNs"
* of AccECN draft.
*/
if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) &&
tcp_ecn_mode_accecn(tp) &&
inet_csk(sk)->icsk_retransmits < 2 &&
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
remaining >= TCPOLEN_ACCECN_BASE)) {
opts->use_synack_ecn_bytes = 1;
remaining -= tcp_options_fit_accecn(opts, 0, remaining);
}
bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
return MAX_TCP_OPTION_SPACE - remaining;
@@ -859,6 +1035,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
struct tcp_request_sock *treq = tcp_rsk(req);
if (tcp_key_is_md5(key)) {
opts->options |= OPTION_MD5;
@@ -921,6 +1098,13 @@ static unsigned int tcp_synack_options(const struct sock *sk,
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
if (treq->accecn_ok &&
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) {
opts->use_synack_ecn_bytes = 1;
remaining -= tcp_options_fit_accecn(opts, 0, remaining);
}
bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
synack_type, opts, &remaining);
@@ -977,17 +1161,32 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
if (unlikely(eff_sacks)) {
const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
TCPOLEN_SACK_PERBLOCK))
return size;
if (likely(remaining >= TCPOLEN_SACK_BASE_ALIGNED +
TCPOLEN_SACK_PERBLOCK)) {
opts->num_sack_blocks =
min_t(unsigned int, eff_sacks,
(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
TCPOLEN_SACK_PERBLOCK);
opts->num_sack_blocks =
min_t(unsigned int, eff_sacks,
(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
TCPOLEN_SACK_PERBLOCK);
size += TCPOLEN_SACK_BASE_ALIGNED +
opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
} else {
opts->num_sack_blocks = 0;
}
} else {
opts->num_sack_blocks = 0;
}
size += TCPOLEN_SACK_BASE_ALIGNED +
opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
if (tcp_ecn_mode_accecn(tp)) {
int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option);
if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) &&
(ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand ||
tcp_accecn_option_beacon_check(sk))) {
opts->use_synack_ecn_bytes = 0;
size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
MAX_TCP_OPTION_SPACE - size);
}
}
if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
@@ -2697,6 +2896,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
sent_pkts = 0;
tcp_mstamp_refresh(tp);
/* AccECN option beacon depends on mstamp, it may change mss */
if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk))
mss_now = tcp_current_mss(sk);
if (!push_one) {
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
@@ -3349,7 +3553,10 @@ start:
tcp_retrans_try_collapse(sk, skb, avail_wnd);
}
/* RFC3168, section 6.1.1.1. ECN fallback */
/* RFC3168, section 6.1.1.1. ECN fallback
* As AccECN uses the same SYN flags (+ AE), this check covers both
* cases.
*/
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
tcp_ecn_clear_syn(sk, skb);

View File

@@ -16,6 +16,7 @@
#include <net/secure_seq.h>
#include <net/ipv6.h>
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -264,6 +265,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
if (!req->syncookie)
ireq->rcv_wscale = rcv_wscale;
ireq->ecn_ok &= cookie_ecn_ok(net, dst);
tcp_rsk(req)->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th);
ret = tcp_get_cookie_sock(sk, skb, req, dst);
if (!ret) {

View File

@@ -544,6 +544,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
tcp_rsk(req)->syn_ect_snt = np->tclass & INET_ECN_MASK;
__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
&ireq->ir_v6_rmt_addr);