From patchwork Tue Jul 3 22:42:47 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [v2, net-next, 01/14] net: Clear skb->tstamp only on the forwarding path X-Patchwork-Submitter: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> X-Patchwork-Id: 938987 X-Patchwork-Delegate: davem@davemloft.net Message-Id: <20180703224300.25300-2-jesus.sanchez-palencia@intel.com> To: netdev@vger.kernel.org Cc: tglx@linutronix.de, jan.altenberg@linutronix.de, vinicius.gomes@intel.com, kurt.kanzenbach@linutronix.de, henrik@austad.us, richardcochran@gmail.com, ilias.apalodimas@linaro.org, ivan.khoronzhuk@linaro.org, mlichvar@redhat.com, willemb@google.com, jhs@mojatatu.com, xiyou.wangcong@gmail.com, jiri@resnulli.us, eric.dumazet@gmail.com, jeffrey.t.kirsher@intel.com Date: Tue, 3 Jul 2018 15:42:47 -0700 From: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> List-Id: This is done in preparation for the upcoming time based transmission patchset. Now that skb->tstamp will be used to hold packet's txtime, we must ensure that it is being cleared when traversing namespaces. Also, doing that from skb_scrub_packet() before the early return would break our feature when tunnels are used. 
Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-4.18.5-rt3/net/core/skbuff.c =================================================================== --- linux-4.18.5-rt3.orig/net/core/skbuff.c +++ linux-4.18.5-rt3/net/core/skbuff.c @@ -4899,7 +4899,6 @@ EXPORT_SYMBOL(skb_try_coalesce); */ void skb_scrub_packet(struct sk_buff *skb, bool xnet) { - skb->tstamp = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; @@ -4914,6 +4913,7 @@ void skb_scrub_packet(struct sk_buff *sk ipvs_reset(skb); skb_orphan(skb); skb->mark = 0; + skb->tstamp = 0; } EXPORT_SYMBOL_GPL(skb_scrub_packet); Index: linux-4.18.5-rt3/arch/alpha/include/uapi/asm/socket.h =================================================================== --- linux-4.18.5-rt3.orig/arch/alpha/include/uapi/asm/socket.h +++ linux-4.18.5-rt3/arch/alpha/include/uapi/asm/socket.h @@ -112,4 +112,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _UAPI_ASM_SOCKET_H */ Index: linux-4.18.5-rt3/arch/ia64/include/uapi/asm/socket.h =================================================================== --- linux-4.18.5-rt3.orig/arch/ia64/include/uapi/asm/socket.h +++ linux-4.18.5-rt3/arch/ia64/include/uapi/asm/socket.h @@ -114,4 +114,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _ASM_IA64_SOCKET_H */ Index: linux-4.18.5-rt3/arch/mips/include/uapi/asm/socket.h =================================================================== --- linux-4.18.5-rt3.orig/arch/mips/include/uapi/asm/socket.h +++ linux-4.18.5-rt3/arch/mips/include/uapi/asm/socket.h @@ -123,4 +123,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _UAPI_ASM_SOCKET_H */ Index: linux-4.18.5-rt3/arch/parisc/include/uapi/asm/socket.h =================================================================== --- 
linux-4.18.5-rt3.orig/arch/parisc/include/uapi/asm/socket.h +++ linux-4.18.5-rt3/arch/parisc/include/uapi/asm/socket.h @@ -104,4 +104,7 @@ #define SO_ZEROCOPY 0x4035 +#define SO_TXTIME 0x4036 +#define SCM_TXTIME SO_TXTIME + #endif /* _UAPI_ASM_SOCKET_H */ Index: linux-4.18.5-rt3/arch/s390/include/uapi/asm/socket.h =================================================================== --- linux-4.18.5-rt3.orig/arch/s390/include/uapi/asm/socket.h +++ linux-4.18.5-rt3/arch/s390/include/uapi/asm/socket.h @@ -111,4 +111,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _ASM_SOCKET_H */ Index: linux-4.18.5-rt3/arch/sparc/include/uapi/asm/socket.h =================================================================== --- linux-4.18.5-rt3.orig/arch/sparc/include/uapi/asm/socket.h +++ linux-4.18.5-rt3/arch/sparc/include/uapi/asm/socket.h @@ -101,6 +101,9 @@ #define SO_ZEROCOPY 0x003e +#define SO_TXTIME 0x003f +#define SCM_TXTIME SO_TXTIME + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 Index: linux-4.18.5-rt3/arch/xtensa/include/uapi/asm/socket.h =================================================================== --- linux-4.18.5-rt3.orig/arch/xtensa/include/uapi/asm/socket.h +++ linux-4.18.5-rt3/arch/xtensa/include/uapi/asm/socket.h @@ -116,4 +116,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _XTENSA_SOCKET_H */ Index: linux-4.18.5-rt3/include/net/sock.h =================================================================== --- linux-4.18.5-rt3.orig/include/net/sock.h +++ linux-4.18.5-rt3/include/net/sock.h @@ -315,6 +315,9 @@ struct sock_common { * @sk_destruct: called at sock freeing time, i.e. 
when all refcnt == 0 * @sk_reuseport_cb: reuseport group container * @sk_rcu: used during RCU grace period + * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME) + * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME + * @sk_txtime_unused: unused txtime flags */ struct sock { /* @@ -468,6 +471,12 @@ struct sock { u8 sk_shutdown; u32 sk_tskey; atomic_t sk_zckey; + + u8 sk_clockid; + u8 sk_txtime_deadline_mode : 1, + sk_txtime_report_errors : 1, + sk_txtime_unused : 6; + struct socket *sk_socket; void *sk_user_data; #ifdef CONFIG_SECURITY @@ -783,6 +792,7 @@ enum sock_flags { SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */ SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ + SOCK_TXTIME, }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) @@ -1578,6 +1588,7 @@ void sock_kzfree_s(struct sock *sk, void void sk_send_sigurg(struct sock *sk); struct sockcm_cookie { + u64 transmit_time; u32 mark; u16 tsflags; }; Index: linux-4.18.5-rt3/include/uapi/asm-generic/socket.h =================================================================== --- linux-4.18.5-rt3.orig/include/uapi/asm-generic/socket.h +++ linux-4.18.5-rt3/include/uapi/asm-generic/socket.h @@ -107,4 +107,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* __ASM_GENERIC_SOCKET_H */ Index: linux-4.18.5-rt3/include/uapi/linux/net_tstamp.h =================================================================== --- linux-4.18.5-rt3.orig/include/uapi/linux/net_tstamp.h +++ linux-4.18.5-rt3/include/uapi/linux/net_tstamp.h @@ -141,4 +141,22 @@ struct scm_ts_pktinfo { __u32 reserved[2]; }; +/* + * SO_TXTIME gets a struct sock_txtime with flags being an integer bit + * field comprised of these values. 
+ */ +enum txtime_flags { + SOF_TXTIME_DEADLINE_MODE = (1 << 0), + SOF_TXTIME_REPORT_ERRORS = (1 << 1), + + SOF_TXTIME_FLAGS_LAST = SOF_TXTIME_REPORT_ERRORS, + SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_FLAGS_LAST - 1) | + SOF_TXTIME_FLAGS_LAST +}; + +struct sock_txtime { + clockid_t clockid; /* reference clockid */ + u32 flags; /* flags defined by enum txtime_flags */ +}; + #endif /* _NET_TIMESTAMPING_H */ Index: linux-4.18.5-rt3/net/core/sock.c =================================================================== --- linux-4.18.5-rt3.orig/net/core/sock.c +++ linux-4.18.5-rt3/net/core/sock.c @@ -91,6 +91,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -697,6 +698,7 @@ EXPORT_SYMBOL(sk_mc_loop); int sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { + struct sock_txtime sk_txtime; struct sock *sk = sock->sk; int val; int valbool; @@ -1070,6 +1072,26 @@ set_rcvbuf: } break; + case SO_TXTIME: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + } else if (optlen != sizeof(struct sock_txtime)) { + ret = -EINVAL; + } else if (copy_from_user(&sk_txtime, optval, + sizeof(struct sock_txtime))) { + ret = -EFAULT; + } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { + ret = -EINVAL; + } else { + sock_valbool_flag(sk, SOCK_TXTIME, true); + sk->sk_clockid = sk_txtime.clockid; + sk->sk_txtime_deadline_mode = + !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + sk->sk_txtime_report_errors = + !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); + } + break; + default: ret = -ENOPROTOOPT; break; @@ -1115,6 +1137,7 @@ int sock_getsockopt(struct socket *sock, u64 val64; struct linger ling; struct timeval tm; + struct sock_txtime txtime; } v; int lv = sizeof(int); @@ -1403,6 +1426,15 @@ int sock_getsockopt(struct socket *sock, v.val = sock_flag(sk, SOCK_ZEROCOPY); break; + case SO_TXTIME: + lv = sizeof(v.txtime); + v.txtime.clockid = sk->sk_clockid; + v.txtime.flags |= 
sk->sk_txtime_deadline_mode ? + SOF_TXTIME_DEADLINE_MODE : 0; + v.txtime.flags |= sk->sk_txtime_report_errors ? + SOF_TXTIME_REPORT_ERRORS : 0; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -2137,6 +2169,13 @@ int __sock_cmsg_send(struct sock *sk, st sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; + case SCM_TXTIME: + if (!sock_flag(sk, SOCK_TXTIME)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) + return -EINVAL; + sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); + break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ case SCM_RIGHTS: case SCM_CREDENTIALS: Index: linux-4.18.5-rt3/include/net/inet_sock.h =================================================================== --- linux-4.18.5-rt3.orig/include/net/inet_sock.h +++ linux-4.18.5-rt3/include/net/inet_sock.h @@ -148,6 +148,7 @@ struct inet_cork { __s16 tos; char priority; __u16 gso_size; + u64 transmit_time; }; struct inet_cork_full { Index: linux-4.18.5-rt3/net/ipv4/icmp.c =================================================================== --- linux-4.18.5-rt3.orig/net/ipv4/icmp.c +++ linux-4.18.5-rt3/net/ipv4/icmp.c @@ -437,6 +437,7 @@ static void icmp_reply(struct icmp_bxm * ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; if (icmp_param->replyopts.opt.opt.optlen) { ipc.opt = &icmp_param->replyopts.opt; @@ -715,6 +716,7 @@ void icmp_send(struct sk_buff *skb_in, i ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, type, code, &icmp_param); Index: linux-4.18.5-rt3/net/ipv4/ip_output.c =================================================================== --- linux-4.18.5-rt3.orig/net/ipv4/ip_output.c +++ linux-4.18.5-rt3/net/ipv4/ip_output.c @@ -1155,6 +1155,7 @@ static int ip_setup_cork(struct sock *sk cork->tos = ipc->tos; cork->priority = ipc->priority; 
cork->tx_flags = ipc->tx_flags; + cork->transmit_time = ipc->sockc.transmit_time; return 0; } @@ -1415,6 +1416,7 @@ struct sk_buff *__ip_make_skb(struct soc skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = cork->transmit_time; /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount @@ -1552,6 +1554,7 @@ void ip_send_unicast_reply(struct sock * ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; Index: linux-4.18.5-rt3/net/ipv4/ping.c =================================================================== --- linux-4.18.5-rt3.orig/net/ipv4/ping.c +++ linux-4.18.5-rt3/net/ipv4/ping.c @@ -746,6 +746,7 @@ static int ping_v4_sendmsg(struct sock * ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); Index: linux-4.18.5-rt3/net/ipv4/raw.c =================================================================== --- linux-4.18.5-rt3.orig/net/ipv4/raw.c +++ linux-4.18.5-rt3/net/ipv4/raw.c @@ -381,6 +381,7 @@ static int raw_send_hdrinc(struct sock * skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *rtp = NULL; @@ -562,6 +563,7 @@ static int raw_sendmsg(struct sock *sk, } ipc.sockc.tsflags = sk->sk_tsflags; + ipc.sockc.transmit_time = 0; ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.tx_flags = 0; Index: linux-4.18.5-rt3/net/ipv4/udp.c =================================================================== --- linux-4.18.5-rt3.orig/net/ipv4/udp.c +++ linux-4.18.5-rt3/net/ipv4/udp.c @@ -930,6 +930,7 @@ int udp_sendmsg(struct sock *sk, struct ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; getfrag = is_udplite ? 
udplite_getfrag : ip_generic_getfrag; Index: linux-4.18.5-rt3/net/ipv6/ip6_output.c =================================================================== --- linux-4.18.5-rt3.orig/net/ipv6/ip6_output.c +++ linux-4.18.5-rt3/net/ipv6/ip6_output.c @@ -1160,7 +1160,8 @@ static void ip6_append_data_mtu(unsigned static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, - struct rt6_info *rt, struct flowi6 *fl6) + struct rt6_info *rt, struct flowi6 *fl6, + const struct sockcm_cookie *sockc) { struct ipv6_pinfo *np = inet6_sk(sk); unsigned int mtu; @@ -1228,6 +1229,8 @@ static int ip6_setup_cork(struct sock *s cork->base.flags |= IPCORK_ALLFRAG; cork->base.length = 0; + cork->base.transmit_time = sockc->transmit_time; + return 0; } @@ -1577,7 +1580,7 @@ int ip6_append_data(struct sock *sk, * setup for corking */ err = ip6_setup_cork(sk, &inet->cork, &np->cork, - ipc6, rt, fl6); + ipc6, rt, fl6, sockc); if (err) return err; @@ -1675,6 +1678,8 @@ struct sk_buff *__ip6_make_skb(struct so skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = cork->base.transmit_time; + skb_dst_set(skb, dst_clone(&rt->dst)); IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); if (proto == IPPROTO_ICMPV6) { @@ -1767,7 +1772,7 @@ struct sk_buff *ip6_make_skb(struct sock cork->base.opt = NULL; cork->base.dst = NULL; v6_cork.opt = NULL; - err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6); + err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6, sockc); if (err) { ip6_cork_release(cork, &v6_cork); return ERR_PTR(err); Index: linux-4.18.5-rt3/net/ipv6/raw.c =================================================================== --- linux-4.18.5-rt3.orig/net/ipv6/raw.c +++ linux-4.18.5-rt3/net/ipv6/raw.c @@ -620,7 +620,7 @@ out: static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, struct flowi6 *fl6, struct dst_entry **dstp, - unsigned int flags) + unsigned int flags, const 
struct sockcm_cookie *sockc) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); @@ -650,6 +650,7 @@ static int rawv6_send_hdrinc(struct sock skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *dstp = NULL; @@ -848,6 +849,7 @@ static int rawv6_sendmsg(struct sock *sk fl6.flowi6_oif = sk->sk_bound_dev_if; sockc.tsflags = sk->sk_tsflags; + sockc.transmit_time = 0; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); @@ -921,7 +923,8 @@ static int rawv6_sendmsg(struct sock *sk back_from_confirm: if (inet->hdrincl) - err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags); + err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, + msg->msg_flags, &sockc); else { ipc6.opt = opt; lock_sock(sk); Index: linux-4.18.5-rt3/net/ipv6/udp.c =================================================================== --- linux-4.18.5-rt3.orig/net/ipv6/udp.c +++ linux-4.18.5-rt3/net/ipv6/udp.c @@ -1148,6 +1148,7 @@ int udpv6_sendmsg(struct sock *sk, struc ipc6.dontfrag = -1; ipc6.gso_size = up->gso_size; sockc.tsflags = sk->sk_tsflags; + sockc.transmit_time = 0; /* destination address check */ if (sin6) { Index: linux-4.18.5-rt3/net/packet/af_packet.c =================================================================== --- linux-4.18.5-rt3.orig/net/packet/af_packet.c +++ linux-4.18.5-rt3/net/packet/af_packet.c @@ -1951,6 +1951,7 @@ retry: goto out_unlock; } + sockc.transmit_time = 0; sockc.tsflags = sk->sk_tsflags; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); @@ -1962,6 +1963,7 @@ retry: skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc.transmit_time; sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags); @@ -2457,6 +2459,7 @@ static int tpacket_fill_skb(struct packe skb->dev = dev; skb->priority = po->sk.sk_priority; skb->mark = po->sk.sk_mark; 
+ skb->tstamp = sockc->transmit_time; sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); skb_shinfo(skb)->destructor_arg = ph.raw; @@ -2633,6 +2636,7 @@ static int tpacket_snd(struct packet_soc if (unlikely(!(dev->flags & IFF_UP))) goto out_put; + sockc.transmit_time = 0; sockc.tsflags = po->sk.sk_tsflags; if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); @@ -2829,6 +2833,7 @@ static int packet_snd(struct socket *soc if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; + sockc.transmit_time = 0; sockc.tsflags = sk->sk_tsflags; sockc.mark = sk->sk_mark; if (msg->msg_controllen) { @@ -2905,6 +2910,7 @@ static int packet_snd(struct socket *soc skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sockc.mark; + skb->tstamp = sockc.transmit_time; if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); Index: linux-4.18.5-rt3/include/net/pkt_sched.h =================================================================== --- linux-4.18.5-rt3.orig/include/net/pkt_sched.h +++ linux-4.18.5-rt3/include/net/pkt_sched.h @@ -72,6 +72,8 @@ struct qdisc_watchdog { struct Qdisc *qdisc; }; +void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, + clockid_t clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires); @@ -153,4 +155,9 @@ struct tc_cbs_qopt_offload { s32 sendslope; }; +struct tc_etf_qopt_offload { + u8 enable; + s32 queue; +}; + #endif Index: linux-4.18.5-rt3/net/sched/sch_api.c =================================================================== --- linux-4.18.5-rt3.orig/net/sched/sch_api.c +++ linux-4.18.5-rt3/net/sched/sch_api.c @@ -596,12 +596,19 @@ static enum hrtimer_restart qdisc_watchd return HRTIMER_NORESTART; } -void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, + clockid_t 
clockid) { - hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); wd->timer.function = qdisc_watchdog; wd->qdisc = qdisc; } +EXPORT_SYMBOL(qdisc_watchdog_init_clockid); + +void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +{ + qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC); +} EXPORT_SYMBOL(qdisc_watchdog_init); void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) Index: linux-4.18.5-rt3/include/linux/netdevice.h =================================================================== --- linux-4.18.5-rt3.orig/include/linux/netdevice.h +++ linux-4.18.5-rt3/include/linux/netdevice.h @@ -792,6 +792,7 @@ enum tc_setup_type { TC_SETUP_QDISC_RED, TC_SETUP_QDISC_PRIO, TC_SETUP_QDISC_MQ, + TC_SETUP_QDISC_ETF, }; /* These structures hold the attributes of bpf state that are being passed Index: linux-4.18.5-rt3/include/uapi/linux/pkt_sched.h =================================================================== --- linux-4.18.5-rt3.orig/include/uapi/linux/pkt_sched.h +++ linux-4.18.5-rt3/include/uapi/linux/pkt_sched.h @@ -934,4 +934,22 @@ enum { #define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + +/* ETF */ +struct tc_etf_qopt { + __s32 delta; + __s32 clockid; + __u32 flags; +#define TC_ETF_DEADLINE_MODE_ON BIT(0) +#define TC_ETF_OFFLOAD_ON BIT(1) +}; + +enum { + TCA_ETF_UNSPEC, + TCA_ETF_PARMS, + __TCA_ETF_MAX, +}; + +#define TCA_ETF_MAX (__TCA_ETF_MAX - 1) + #endif Index: linux-4.18.5-rt3/net/sched/Kconfig =================================================================== --- linux-4.18.5-rt3.orig/net/sched/Kconfig +++ linux-4.18.5-rt3/net/sched/Kconfig @@ -183,6 +183,17 @@ config NET_SCH_CBS To compile this code as a module, choose M here: the module will be called sch_cbs. +config NET_SCH_ETF + tristate "Earliest TxTime First (ETF)" + help + Say Y here if you want to use the Earliest TxTime First (ETF) packet + scheduling algorithm. 
+ + See the top of <file:net/sched/sch_etf.c> for more details. + + To compile this code as a module, choose M here: the + module will be called sch_etf. + config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" ---help--- Index: linux-4.18.5-rt3/net/sched/Makefile =================================================================== --- linux-4.18.5-rt3.orig/net/sched/Makefile +++ linux-4.18.5-rt3/net/sched/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o +obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o Index: linux-4.18.5-rt3/net/sched/sch_etf.c =================================================================== --- /dev/null +++ linux-4.18.5-rt3/net/sched/sch_etf.c @@ -0,0 +1,484 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* net/sched/sch_etf.c Earliest TxTime First queueing discipline. + * + * Authors: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> + * Vinicius Costa Gomes <vinicius.gomes@intel.com> + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) +#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON) + +struct etf_sched_data { + bool offload; + bool deadline_mode; + int clockid; + int queue; + s32 delta; /* in ns */ + ktime_t last; /* The txtime of the last skb sent to the netdevice. */ + struct rb_root head; + struct qdisc_watchdog watchdog; + ktime_t (*get_time)(void); +}; + +static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = { + [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) }, +}; + +static inline int validate_input_params(struct tc_etf_qopt *qopt, + struct netlink_ext_ack *extack) +{ + /* Check if params comply to the following rules: + * * Clockid and delta must be valid. + * + * * Dynamic clockids are not supported. 
+ * + * * Delta must be a positive integer. + * + * Also note that for the HW offload case, we must + * expect that system clocks have been synchronized to PHC. + */ + if (qopt->clockid < 0) { + NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported"); + return -ENOTSUPP; + } + + if (qopt->clockid != CLOCK_TAI) { + NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used"); + return -EINVAL; + } + + if (qopt->delta < 0) { + NL_SET_ERR_MSG(extack, "Delta must be positive"); + return -EINVAL; + } + + return 0; +} + +static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) +{ + struct etf_sched_data *q = qdisc_priv(sch); + ktime_t txtime = nskb->tstamp; + struct sock *sk = nskb->sk; + ktime_t now; + + if (!sk) + return false; + + if (!sock_flag(sk, SOCK_TXTIME)) + return false; + + /* We don't perform crosstimestamping. + * Drop if packet's clockid differs from qdisc's. + */ + if (sk->sk_clockid != q->clockid) + return false; + + if (sk->sk_txtime_deadline_mode != q->deadline_mode) + return false; + + now = q->get_time(); + if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) + return false; + + return true; +} + +static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node *p; + + p = rb_first(&q->head); + if (!p) + return NULL; + + return rb_to_skb(p); +} + +static void reset_watchdog(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = etf_peek_timesortedlist(sch); + ktime_t next; + + if (!skb) + return; + + next = ktime_sub_ns(skb->tstamp, q->delta); + qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); +} + +static void report_sock_error(struct sk_buff *skb, u32 err, u8 code) +{ + struct sock_exterr_skb *serr; + struct sk_buff *clone; + ktime_t txtime = skb->tstamp; + + if (!skb->sk || !(skb->sk->sk_txtime_report_errors)) + return; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) + return; + + serr = 
SKB_EXT_ERR(clone); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME; + serr->ee.ee_type = 0; + serr->ee.ee_code = code; + serr->ee.ee_pad = 0; + serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */ + serr->ee.ee_info = txtime; /* low part of tstamp */ + + if (sock_queue_err_skb(skb->sk, clone)) + kfree_skb(clone); +} + +static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node **p = &q->head.rb_node, *parent = NULL; + ktime_t txtime = nskb->tstamp; + + if (!is_packet_valid(sch, nskb)) { + report_sock_error(nskb, EINVAL, + SO_EE_CODE_TXTIME_INVALID_PARAM); + return qdisc_drop(nskb, sch, to_free); + } + + while (*p) { + struct sk_buff *skb; + + parent = *p; + skb = rb_to_skb(parent); + if (ktime_after(txtime, skb->tstamp)) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&nskb->rbnode, parent, p); + rb_insert_color(&nskb->rbnode, &q->head); + + qdisc_qstats_backlog_inc(sch, nskb); + sch->q.qlen++; + + /* Now we may need to re-arm the qdisc watchdog for the next packet. */ + reset_watchdog(sch); + + return NET_XMIT_SUCCESS; +} + +static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, + bool drop) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + rb_erase(&skb->rbnode, &q->head); + + /* The rbnode field in the skb re-uses these fields, now that + * we are done with the rbnode, reset them. 
+ */ + skb->next = NULL; + skb->prev = NULL; + skb->dev = qdisc_dev(sch); + + qdisc_qstats_backlog_dec(sch, skb); + + if (drop) { + struct sk_buff *to_free = NULL; + + report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); + + qdisc_drop(skb, sch, &to_free); + kfree_skb_list(to_free); + qdisc_qstats_overlimit(sch); + } else { + qdisc_bstats_update(sch, skb); + + q->last = skb->tstamp; + } + + sch->q.qlen--; +} + +static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + ktime_t now, next; + + skb = etf_peek_timesortedlist(sch); + if (!skb) + return NULL; + + now = q->get_time(); + + /* Drop if packet has expired while in queue. */ + if (ktime_before(skb->tstamp, now)) { + timesortedlist_erase(sch, skb, true); + skb = NULL; + goto out; + } + + /* When in deadline mode, dequeue as soon as possible and change the + * txtime from deadline to (now + delta). + */ + if (q->deadline_mode) { + timesortedlist_erase(sch, skb, false); + skb->tstamp = now; + goto out; + } + + next = ktime_sub_ns(skb->tstamp, q->delta); + + /* Dequeue only if now is within the [txtime - delta, txtime] range. */ + if (ktime_after(now, next)) + timesortedlist_erase(sch, skb, false); + else + skb = NULL; + +out: + /* Now we may need to re-arm the qdisc watchdog for the next packet. 
*/ + reset_watchdog(sch); + + return skb; +} + +static void etf_disable_offload(struct net_device *dev, + struct etf_sched_data *q) +{ + struct tc_etf_qopt_offload etf = { }; + const struct net_device_ops *ops; + int err; + + if (!q->offload) + return; + + ops = dev->netdev_ops; + if (!ops->ndo_setup_tc) + return; + + etf.queue = q->queue; + etf.enable = 0; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); + if (err < 0) + pr_warn("Couldn't disable ETF offload for queue %d\n", + etf.queue); +} + +static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_etf_qopt_offload etf = { }; + int err; + + if (q->offload) + return 0; + + if (!ops->ndo_setup_tc) { + NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload"); + return -EOPNOTSUPP; + } + + etf.queue = q->queue; + etf.enable = 1; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload"); + return err; + } + + return 0; +} + +static int etf_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct nlattr *tb[TCA_ETF_MAX + 1]; + struct tc_etf_qopt *qopt; + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, + "Missing ETF qdisc options which are mandatory"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_ETF_PARMS]) { + NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters"); + return -EINVAL; + } + + qopt = nla_data(tb[TCA_ETF_PARMS]); + + pr_debug("delta %d clockid %d offload %s deadline %s\n", + qopt->delta, qopt->clockid, + OFFLOAD_IS_ON(qopt) ? "on" : "off", + DEADLINE_MODE_IS_ON(qopt) ? 
"on" : "off"); + + err = validate_input_params(qopt, extack); + if (err < 0) + return err; + + q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + + if (OFFLOAD_IS_ON(qopt)) { + err = etf_enable_offload(dev, q, extack); + if (err < 0) + return err; + } + + /* Everything went OK, save the parameters used. */ + q->delta = qopt->delta; + q->clockid = qopt->clockid; + q->offload = OFFLOAD_IS_ON(qopt); + q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); + + switch (q->clockid) { + case CLOCK_REALTIME: + q->get_time = ktime_get_real; + break; + case CLOCK_MONOTONIC: + q->get_time = ktime_get; + break; + case CLOCK_BOOTTIME: + q->get_time = ktime_get_boottime; + break; + case CLOCK_TAI: + q->get_time = ktime_get_clocktai; + break; + default: + NL_SET_ERR_MSG(extack, "Clockid is not supported"); + return -ENOTSUPP; + } + + qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid); + + return 0; +} + +static void timesortedlist_clear(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node *p = rb_first(&q->head); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + + rb_erase(&skb->rbnode, &q->head); + rtnl_kfree_skbs(skb, skb); + sch->q.qlen--; + } +} + +static void etf_reset(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + /* Only cancel watchdog if it's been initialized. */ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); + + /* No matter which mode we are on, it's safe to clear both lists. */ + timesortedlist_clear(sch); + __qdisc_reset_queue(&sch->q); + + sch->qstats.backlog = 0; + sch->q.qlen = 0; + + q->last = 0; +} + +static void etf_destroy(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + + /* Only cancel watchdog if it's been initialized. 
*/ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); + + etf_disable_offload(dev, q); +} + +static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct tc_etf_qopt opt = { }; + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + opt.delta = q->delta; + opt.clockid = q->clockid; + if (q->offload) + opt.flags |= TC_ETF_OFFLOAD_ON; + + if (q->deadline_mode) + opt.flags |= TC_ETF_DEADLINE_MODE_ON; + + if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct Qdisc_ops etf_qdisc_ops __read_mostly = { + .id = "etf", + .priv_size = sizeof(struct etf_sched_data), + .enqueue = etf_enqueue_timesortedlist, + .dequeue = etf_dequeue_timesortedlist, + .peek = etf_peek_timesortedlist, + .init = etf_init, + .reset = etf_reset, + .destroy = etf_destroy, + .dump = etf_dump, + .owner = THIS_MODULE, +}; + +static int __init etf_module_init(void) +{ + return register_qdisc(&etf_qdisc_ops); +} + +static void __exit etf_module_exit(void) +{ + unregister_qdisc(&etf_qdisc_ops); +} +module_init(etf_module_init) +module_exit(etf_module_exit) +MODULE_LICENSE("GPL"); Index: linux-4.18.5-rt3/drivers/net/ethernet/intel/igb/igb_main.c =================================================================== --- linux-4.18.5-rt3.orig/drivers/net/ethernet/intel/igb/igb_main.c +++ linux-4.18.5-rt3/drivers/net/ethernet/intel/igb/igb_main.c @@ -1654,33 +1654,65 @@ static void set_queue_mode(struct e1000_ wr32(E1000_I210_TQAVCC(queue), val); } +static bool is_any_cbs_enabled(struct igb_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_tx_queues; i++) { + if (adapter->tx_ring[i]->cbs_enable) + return true; + } + + return false; +} + +static bool is_any_txtime_enabled(struct igb_adapter *adapter) +{ + int i; + + for (i = 0; i < 
adapter->num_tx_queues; i++) { + if (adapter->tx_ring[i]->launchtime_enable) + return true; + } + + return false; +} + /** - * igb_configure_cbs - Configure Credit-Based Shaper (CBS) + * igb_config_tx_modes - Configure "Qav Tx mode" features on igb * @adapter: pointer to adapter struct * @queue: queue number - * @enable: true = enable CBS, false = disable CBS - * @idleslope: idleSlope in kbps - * @sendslope: sendSlope in kbps - * @hicredit: hiCredit in bytes - * @locredit: loCredit in bytes * - * Configure CBS for a given hardware queue. When disabling, idleslope, - * sendslope, hicredit, locredit arguments are ignored. Returns 0 if - * success. Negative otherwise. - **/ -static void igb_configure_cbs(struct igb_adapter *adapter, int queue, - bool enable, int idleslope, int sendslope, - int hicredit, int locredit) + * Configure CBS and Launchtime for a given hardware queue. + * Parameters are retrieved from the correct Tx ring, so + * igb_save_cbs_params() and igb_save_txtime_params() should be used + * for setting those correctly prior to this function being called. + **/ +static void igb_config_tx_modes(struct igb_adapter *adapter, int queue) { + struct igb_ring *ring = adapter->tx_ring[queue]; struct net_device *netdev = adapter->netdev; struct e1000_hw *hw = &adapter->hw; - u32 tqavcc; + u32 tqavcc, tqavctrl; u16 value; WARN_ON(hw->mac.type != e1000_i210); WARN_ON(queue < 0 || queue > 1); - if (enable || queue == 0) { + /* If any of the Qav features is enabled, configure queues as SR and + * with HIGH PRIO. If none is, then configure them with LOW PRIO and + * as SP. + */ + if (ring->cbs_enable || ring->launchtime_enable) { + set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH); + set_queue_mode(hw, queue, QUEUE_MODE_STREAM_RESERVATION); + } else { + set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_LOW); + set_queue_mode(hw, queue, QUEUE_MODE_STRICT_PRIORITY); + } + + /* If CBS is enabled, set DataTranARB and config its parameters. 
*/ + if (ring->cbs_enable || queue == 0) { /* i210 does not allow the queue 0 to be in the Strict * Priority mode while the Qav mode is enabled, so, * instead of disabling strict priority mode, we give @@ -1690,14 +1722,19 @@ static void igb_configure_cbs(struct igb * Queue0 QueueMode must be set to 1b when * TransmitMode is set to Qav." */ - if (queue == 0 && !enable) { + if (queue == 0 && !ring->cbs_enable) { /* max "linkspeed" idleslope in kbps */ - idleslope = 1000000; - hicredit = ETH_FRAME_LEN; + ring->idleslope = 1000000; + ring->hicredit = ETH_FRAME_LEN; } - set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH); - set_queue_mode(hw, queue, QUEUE_MODE_STREAM_RESERVATION); + /* Always set data transfer arbitration to credit-based + * shaper algorithm on TQAVCTRL if CBS is enabled for any of + * the queues. + */ + tqavctrl = rd32(E1000_I210_TQAVCTRL); + tqavctrl |= E1000_TQAVCTRL_DATATRANARB; + wr32(E1000_I210_TQAVCTRL, tqavctrl); /* According to i210 datasheet section 7.2.7.7, we should set * the 'idleSlope' field from TQAVCC register following the @@ -1756,17 +1793,16 @@ static void igb_configure_cbs(struct igb * calculated value, so the resulting bandwidth might * be slightly higher for some configurations. */ - value = DIV_ROUND_UP_ULL(idleslope * 61034ULL, 1000000); + value = DIV_ROUND_UP_ULL(ring->idleslope * 61034ULL, 1000000); tqavcc = rd32(E1000_I210_TQAVCC(queue)); tqavcc &= ~E1000_TQAVCC_IDLESLOPE_MASK; tqavcc |= value; wr32(E1000_I210_TQAVCC(queue), tqavcc); - wr32(E1000_I210_TQAVHC(queue), 0x80000000 + hicredit * 0x7735); + wr32(E1000_I210_TQAVHC(queue), + 0x80000000 + ring->hicredit * 0x7735); } else { - set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_LOW); - set_queue_mode(hw, queue, QUEUE_MODE_STRICT_PRIORITY); /* Set idleSlope to zero. */ tqavcc = rd32(E1000_I210_TQAVCC(queue)); @@ -1775,6 +1811,43 @@ static void igb_configure_cbs(struct igb /* Set hiCredit to zero. 
*/ wr32(E1000_I210_TQAVHC(queue), 0); + + /* If CBS is not enabled for any queues anymore, then return to + * the default state of Data Transmission Arbitration on + * TQAVCTRL. + */ + if (!is_any_cbs_enabled(adapter)) { + tqavctrl = rd32(E1000_I210_TQAVCTRL); + tqavctrl &= ~E1000_TQAVCTRL_DATATRANARB; + wr32(E1000_I210_TQAVCTRL, tqavctrl); + } + } + + /* If LaunchTime is enabled, set DataTranTIM. */ + if (ring->launchtime_enable) { + /* Always set DataTranTIM on TQAVCTRL if LaunchTime is enabled + * for any of the SR queues, and configure fetchtime delta. + * XXX NOTE: + * - LaunchTime will be enabled for all SR queues. + * - A fixed offset can be added relative to the launch + * time of all packets if configured at reg LAUNCH_OS0. + * We are keeping it as 0 for now (default value). + */ + tqavctrl = rd32(E1000_I210_TQAVCTRL); + tqavctrl |= E1000_TQAVCTRL_DATATRANTIM | + E1000_TQAVCTRL_FETCHTIME_DELTA; + wr32(E1000_I210_TQAVCTRL, tqavctrl); + } else { + /* If Launchtime is not enabled for any SR queues anymore, + * then clear DataTranTIM on TQAVCTRL and clear fetchtime delta, + * effectively disabling Launchtime. + */ + if (!is_any_txtime_enabled(adapter)) { + tqavctrl = rd32(E1000_I210_TQAVCTRL); + tqavctrl &= ~E1000_TQAVCTRL_DATATRANTIM; + tqavctrl &= ~E1000_TQAVCTRL_FETCHTIME_DELTA; + wr32(E1000_I210_TQAVCTRL, tqavctrl); + } } /* XXX: In i210 controller the sendSlope and loCredit parameters from @@ -1782,9 +1855,27 @@ static void igb_configure_cbs(struct igb * configuration' in respect to these parameters. */ - netdev_dbg(netdev, "CBS %s: queue %d idleslope %d sendslope %d hiCredit %d locredit %d\n", - (enable) ? "enabled" : "disabled", queue, - idleslope, sendslope, hicredit, locredit); + netdev_dbg(netdev, "Qav Tx mode: cbs %s, launchtime %s, queue %d " + "idleslope %d sendslope %d hiCredit %d " + "locredit %d\n", + (ring->cbs_enable) ? "enabled" : "disabled", + (ring->launchtime_enable) ?
"enabled" : "disabled", queue, + ring->idleslope, ring->sendslope, ring->hicredit, + ring->locredit); +} + +static int igb_save_txtime_params(struct igb_adapter *adapter, int queue, + bool enable) +{ + struct igb_ring *ring; + + if (queue < 0 || queue >= adapter->num_tx_queues) + return -EINVAL; + + ring = adapter->tx_ring[queue]; /* index validated above */ + ring->launchtime_enable = enable; + + return 0; } static int igb_save_cbs_params(struct igb_adapter *adapter, int queue, @@ -1807,21 +1898,15 @@ static int igb_save_cbs_params(struct ig return 0; } -static bool is_any_cbs_enabled(struct igb_adapter *adapter) -{ - struct igb_ring *ring; - int i; - - for (i = 0; i < adapter->num_tx_queues; i++) { - ring = adapter->tx_ring[i]; - - if (ring->cbs_enable) - return true; - } - - return false; -} - +/** + * igb_setup_tx_mode - Switch to/from Qav Tx mode when applicable + * @adapter: pointer to adapter struct + * + * Configure TQAVCTRL register switching the controller's Tx mode + * if FQTSS mode is enabled or disabled. Additionally, will issue + * a call to igb_config_tx_modes() per queue so any previously saved + * Tx parameters are applied. + **/ static void igb_setup_tx_mode(struct igb_adapter *adapter) { struct net_device *netdev = adapter->netdev; @@ -1836,11 +1921,11 @@ static void igb_setup_tx_mode(struct igb int i, max_queue; /* Configure TQAVCTRL register: set transmit mode to 'Qav', - * set data fetch arbitration to 'round robin' and set data - * transfer arbitration to 'credit shaper algorithm. + * set data fetch arbitration to 'round robin', set SP_WAIT_SR + * so SP queues wait for SR ones.
*/ val = rd32(E1000_I210_TQAVCTRL); - val |= E1000_TQAVCTRL_XMIT_MODE | E1000_TQAVCTRL_DATATRANARB; + val |= E1000_TQAVCTRL_XMIT_MODE | E1000_TQAVCTRL_SP_WAIT_SR; val &= ~E1000_TQAVCTRL_DATAFETCHARB; wr32(E1000_I210_TQAVCTRL, val); @@ -1881,11 +1966,7 @@ static void igb_setup_tx_mode(struct igb adapter->num_tx_queues : I210_SR_QUEUES_NUM; for (i = 0; i < max_queue; i++) { - struct igb_ring *ring = adapter->tx_ring[i]; - - igb_configure_cbs(adapter, i, ring->cbs_enable, - ring->idleslope, ring->sendslope, - ring->hicredit, ring->locredit); + igb_config_tx_modes(adapter, i); } } else { wr32(E1000_RXPBS, I210_RXPBSIZE_DEFAULT); @@ -2459,6 +2540,19 @@ igb_features_check(struct sk_buff *skb, return features; } +static void igb_offload_apply(struct igb_adapter *adapter, s32 queue) +{ + if (!is_fqtss_enabled(adapter)) { + enable_fqtss(adapter, true); + return; + } + + igb_config_tx_modes(adapter, queue); + + if (!is_any_cbs_enabled(adapter) && !is_any_txtime_enabled(adapter)) + enable_fqtss(adapter, false); +} + static int igb_offload_cbs(struct igb_adapter *adapter, struct tc_cbs_qopt_offload *qopt) { @@ -2479,17 +2573,7 @@ static int igb_offload_cbs(struct igb_ad if (err) return err; - if (is_fqtss_enabled(adapter)) { - igb_configure_cbs(adapter, qopt->queue, qopt->enable, - qopt->idleslope, qopt->sendslope, - qopt->hicredit, qopt->locredit); - - if (!is_any_cbs_enabled(adapter)) - enable_fqtss(adapter, false); - - } else { - enable_fqtss(adapter, true); - } + igb_offload_apply(adapter, qopt->queue); return 0; } @@ -2738,6 +2822,29 @@ static int igb_setup_tc_block(struct igb } } +static int igb_offload_txtime(struct igb_adapter *adapter, + struct tc_etf_qopt_offload *qopt) +{ + struct e1000_hw *hw = &adapter->hw; + int err; + + /* Launchtime offloading is only supported by i210 controller. */ + if (hw->mac.type != e1000_i210) + return -EOPNOTSUPP; + + /* Launchtime offloading is only supported by queues 0 and 1. 
*/ + if (qopt->queue < 0 || qopt->queue > 1) + return -EINVAL; + + err = igb_save_txtime_params(adapter, qopt->queue, qopt->enable); + if (err) + return err; + + igb_offload_apply(adapter, qopt->queue); + + return 0; +} + static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { @@ -2748,6 +2855,8 @@ static int igb_setup_tc(struct net_devic return igb_offload_cbs(adapter, type_data); case TC_SETUP_BLOCK: return igb_setup_tc_block(adapter, type_data); + case TC_SETUP_QDISC_ETF: + return igb_offload_txtime(adapter, type_data); default: return -EOPNOTSUPP; @@ -5568,11 +5677,14 @@ set_itr_now: } } -static void igb_tx_ctxtdesc(struct igb_ring *tx_ring, u32 vlan_macip_lens, - u32 type_tucmd, u32 mss_l4len_idx) +static void igb_tx_ctxtdesc(struct igb_ring *tx_ring, + struct igb_tx_buffer *first, + u32 vlan_macip_lens, u32 type_tucmd, + u32 mss_l4len_idx) { struct e1000_adv_tx_context_desc *context_desc; u16 i = tx_ring->next_to_use; + struct timespec64 ts; context_desc = IGB_TX_CTXTDESC(tx_ring, i); @@ -5587,9 +5699,18 @@ static void igb_tx_ctxtdesc(struct igb_r mss_l4len_idx |= tx_ring->reg_idx << 4; context_desc->vlan_macip_lens = cpu_to_le32(vlan_macip_lens); - context_desc->seqnum_seed = 0; context_desc->type_tucmd_mlhl = cpu_to_le32(type_tucmd); context_desc->mss_l4len_idx = cpu_to_le32(mss_l4len_idx); + + /* We assume there is always a valid tx time available. Invalid times + * should have been handled by the upper layers. 
+ */ + if (tx_ring->launchtime_enable) { + ts = ns_to_timespec64(first->skb->tstamp); + context_desc->seqnum_seed = cpu_to_le32(ts.tv_nsec / 32); + } else { + context_desc->seqnum_seed = 0; + } } static int igb_tso(struct igb_ring *tx_ring, @@ -5672,7 +5793,8 @@ static int igb_tso(struct igb_ring *tx_r vlan_macip_lens |= (ip.hdr - skb->data) << E1000_ADVTXD_MACLEN_SHIFT; vlan_macip_lens |= first->tx_flags & IGB_TX_FLAGS_VLAN_MASK; - igb_tx_ctxtdesc(tx_ring, vlan_macip_lens, type_tucmd, mss_l4len_idx); + igb_tx_ctxtdesc(tx_ring, first, vlan_macip_lens, + type_tucmd, mss_l4len_idx); return 1; } @@ -5727,7 +5849,7 @@ no_csum: vlan_macip_lens |= skb_network_offset(skb) << E1000_ADVTXD_MACLEN_SHIFT; vlan_macip_lens |= first->tx_flags & IGB_TX_FLAGS_VLAN_MASK; - igb_tx_ctxtdesc(tx_ring, vlan_macip_lens, type_tucmd, 0); + igb_tx_ctxtdesc(tx_ring, first, vlan_macip_lens, type_tucmd, 0); } #define IGB_SET_FLAG(_input, _flag, _result) \ @@ -6015,8 +6137,6 @@ netdev_tx_t igb_xmit_frame_ring(struct s } } - skb_tx_timestamp(skb); - if (skb_vlan_tag_present(skb)) { tx_flags |= IGB_TX_FLAGS_VLAN; tx_flags |= (skb_vlan_tag_get(skb) << IGB_TX_FLAGS_VLAN_SHIFT); @@ -6032,6 +6152,8 @@ netdev_tx_t igb_xmit_frame_ring(struct s else if (!tso) igb_tx_csum(tx_ring, first); + skb_tx_timestamp(skb); + if (igb_tx_map(tx_ring, first, hdr_len)) goto cleanup_tx_tstamp; Index: linux-4.18.5-rt3/drivers/net/ethernet/intel/igb/e1000_defines.h =================================================================== --- linux-4.18.5-rt3.orig/drivers/net/ethernet/intel/igb/e1000_defines.h +++ linux-4.18.5-rt3/drivers/net/ethernet/intel/igb/e1000_defines.h @@ -1048,6 +1048,22 @@ #define E1000_TQAVCTRL_XMIT_MODE BIT(0) #define E1000_TQAVCTRL_DATAFETCHARB BIT(4) #define E1000_TQAVCTRL_DATATRANARB BIT(8) +#define E1000_TQAVCTRL_DATATRANTIM BIT(9) +#define E1000_TQAVCTRL_SP_WAIT_SR BIT(10) +/* Fetch Time Delta - bits 31:16 + * + * This field holds the value to be reduced from the launch time for + * fetch time 
decision. The FetchTimeDelta value is defined in 32 ns + * granularity. + * + * This field is 16 bits wide, and so the maximum value is: + * + * 65535 * 32 = 2097120 ~= 2.1 msec + * + * XXX: We are configuring the max value here since we couldn't come up + * with a reason for not doing so. + */ +#define E1000_TQAVCTRL_FETCHTIME_DELTA (0xFFFFU << 16) /* TX Qav Credit Control fields */ #define E1000_TQAVCC_IDLESLOPE_MASK 0xFFFF Index: linux-4.18.5-rt3/drivers/net/ethernet/intel/igb/igb.h =================================================================== --- linux-4.18.5-rt3.orig/drivers/net/ethernet/intel/igb/igb.h +++ linux-4.18.5-rt3/drivers/net/ethernet/intel/igb/igb.h @@ -262,6 +262,7 @@ struct igb_ring { u16 count; /* number of desc. in the ring */ u8 queue_index; /* logical index of the ring*/ u8 reg_idx; /* physical index of the ring */ + bool launchtime_enable; /* true if LaunchTime is enabled */ bool cbs_enable; /* indicates if CBS is enabled */ s32 idleslope; /* idleSlope in kbps */ s32 sendslope; /* sendSlope in kbps */ Index: linux-4.18.5-rt3/include/uapi/linux/errqueue.h =================================================================== --- linux-4.18.5-rt3.orig/include/uapi/linux/errqueue.h +++ linux-4.18.5-rt3/include/uapi/linux/errqueue.h @@ -20,12 +20,16 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 #define SO_EE_ORIGIN_ZEROCOPY 5 +#define SO_EE_ORIGIN_TXTIME 6 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) #define SO_EE_CODE_ZEROCOPY_COPIED 1 +#define SO_EE_CODE_TXTIME_INVALID_PARAM 1 +#define SO_EE_CODE_TXTIME_MISSED 2 + /** * struct scm_timestamping - timestamps exposed through cmsg *