From patchwork Wed Mar  7 01:12:13 2018
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Subject: [RFC,v3,net-next,01/18] sock: Fix SO_ZEROCOPY switch case
X-Patchwork-Submitter: Jesus Sanchez-Palencia
 <jesus.sanchez-palencia@intel.com>
X-Patchwork-Id: 882329
X-Patchwork-Delegate: davem@davemloft.net
Message-Id: <20180307011230.24001-2-jesus.sanchez-palencia@intel.com>
To: netdev@vger.kernel.org
Cc: jhs@mojatatu.com, xiyou.wangcong@gmail.com, jiri@resnulli.us,
 vinicius.gomes@intel.com, richardcochran@gmail.com,
 intel-wired-lan@lists.osuosl.org, anna-maria@linutronix.de,
 henrik@austad.us, tglx@linutronix.de, john.stultz@linaro.org,
 levi.pearson@harman.com, edumazet@google.com, willemb@google.com,
 mlichvar@redhat.com,
 Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Date: Tue,  6 Mar 2018 17:12:13 -0800
From: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
List-Id: <netdev.vger.kernel.org>

Fix the SO_ZEROCOPY switch case in sock_setsockopt() so that its return
value is no longer overwritten by the one set in the default case.

Fixes: 28190752c7092 ("sock: permit SO_ZEROCOPY on PF_RDS socket")
Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Acked-by: Willem de Bruijn <willemb@google.com>
---
 arch/alpha/include/uapi/asm/socket.h           |    5 
 arch/frv/include/uapi/asm/socket.h             |    5 
 arch/ia64/include/uapi/asm/socket.h            |    5 
 arch/m32r/include/uapi/asm/socket.h            |    5 
 arch/mips/include/uapi/asm/socket.h            |    5 
 arch/mn10300/include/uapi/asm/socket.h         |    5 
 arch/parisc/include/uapi/asm/socket.h          |    5 
 arch/s390/include/uapi/asm/socket.h            |    5 
 arch/sparc/include/uapi/asm/socket.h           |    5 
 arch/xtensa/include/uapi/asm/socket.h          |    5 
 drivers/net/ethernet/intel/igb/e1000_defines.h |   16 
 drivers/net/ethernet/intel/igb/igb.h           |    1 
 drivers/net/ethernet/intel/igb/igb_main.c      |  243 +++++++---
 include/linux/netdevice.h                      |    2 
 include/linux/posix-timers.h                   |    1 
 include/linux/skbuff.h                         |    3 
 include/net/pkt_sched.h                        |    7 
 include/net/sock.h                             |    4 
 include/uapi/asm-generic/socket.h              |    5 
 include/uapi/linux/pkt_sched.h                 |   18 
 net/core/skbuff.c                              |    1 
 net/core/sock.c                                |   41 +
 net/ipv4/raw.c                                 |    7 
 net/ipv4/udp.c                                 |   10 
 net/packet/af_packet.c                         |   19 
 net/sched/Kconfig                              |   11 
 net/sched/Makefile                             |    1 
 net/sched/sch_api.c                            |   11 
 net/sched/sch_tbs.c                            |  591 +++++++++++++++++++++++++
 29 files changed, 978 insertions(+), 64 deletions(-)

Index: linux-4.16.12-rt5/arch/alpha/include/uapi/asm/socket.h
===================================================================
--- linux-4.16.12-rt5.orig/arch/alpha/include/uapi/asm/socket.h
+++ linux-4.16.12-rt5/arch/alpha/include/uapi/asm/socket.h
@@ -112,4 +112,9 @@
 
 #define SO_ZEROCOPY            60
 
+#define SO_TXTIME              61
+#define SCM_TXTIME             SO_TXTIME
+#define SCM_DROP_IF_LATE       62
+#define SCM_CLOCKID            63
+
 #endif /* _UAPI_ASM_SOCKET_H */
Index: linux-4.16.12-rt5/arch/frv/include/uapi/asm/socket.h
===================================================================
--- linux-4.16.12-rt5.orig/arch/frv/include/uapi/asm/socket.h
+++ linux-4.16.12-rt5/arch/frv/include/uapi/asm/socket.h
@@ -105,5 +105,10 @@
 
 #define SO_ZEROCOPY            60
 
+#define SO_TXTIME              61
+#define SCM_TXTIME             SO_TXTIME
+#define SCM_DROP_IF_LATE       62
+#define SCM_CLOCKID            63
+
 #endif /* _ASM_SOCKET_H */
 
Index: linux-4.16.12-rt5/arch/ia64/include/uapi/asm/socket.h
===================================================================
--- linux-4.16.12-rt5.orig/arch/ia64/include/uapi/asm/socket.h
+++ linux-4.16.12-rt5/arch/ia64/include/uapi/asm/socket.h
@@ -114,4 +114,9 @@
 
 #define SO_ZEROCOPY            60
 
+#define SO_TXTIME              61
+#define SCM_TXTIME             SO_TXTIME
+#define SCM_DROP_IF_LATE       62
+#define SCM_CLOCKID            63
+
 #endif /* _ASM_IA64_SOCKET_H */
Index: linux-4.16.12-rt5/arch/m32r/include/uapi/asm/socket.h
===================================================================
--- linux-4.16.12-rt5.orig/arch/m32r/include/uapi/asm/socket.h
+++ linux-4.16.12-rt5/arch/m32r/include/uapi/asm/socket.h
@@ -105,4 +105,9 @@
 
 #define SO_ZEROCOPY            60
 
+#define SO_TXTIME              61
+#define SCM_TXTIME             SO_TXTIME
+#define SCM_DROP_IF_LATE       62
+#define SCM_CLOCKID            63
+
 #endif /* _ASM_M32R_SOCKET_H */
Index: linux-4.16.12-rt5/arch/mips/include/uapi/asm/socket.h
===================================================================
--- linux-4.16.12-rt5.orig/arch/mips/include/uapi/asm/socket.h
+++ linux-4.16.12-rt5/arch/mips/include/uapi/asm/socket.h
@@ -123,4 +123,9 @@
 
 #define SO_ZEROCOPY            60
 
+#define SO_TXTIME              61
+#define SCM_TXTIME             SO_TXTIME
+#define SCM_DROP_IF_LATE       62
+#define SCM_CLOCKID            63
+
 #endif /* _UAPI_ASM_SOCKET_H */
Index: linux-4.16.12-rt5/arch/mn10300/include/uapi/asm/socket.h
===================================================================
--- linux-4.16.12-rt5.orig/arch/mn10300/include/uapi/asm/socket.h
+++ linux-4.16.12-rt5/arch/mn10300/include/uapi/asm/socket.h
@@ -105,4 +105,9 @@
 
 #define SO_ZEROCOPY            60
 
+#define SO_TXTIME              61
+#define SCM_TXTIME             SO_TXTIME
+#define SCM_DROP_IF_LATE       62
+#define SCM_CLOCKID            63
+
 #endif /* _ASM_SOCKET_H */
Index: linux-4.16.12-rt5/arch/parisc/include/uapi/asm/socket.h
===================================================================
--- linux-4.16.12-rt5.orig/arch/parisc/include/uapi/asm/socket.h
+++ linux-4.16.12-rt5/arch/parisc/include/uapi/asm/socket.h
@@ -104,4 +104,9 @@
 
 #define SO_ZEROCOPY            0x4035
 
+#define SO_TXTIME              0x4036
+#define SCM_TXTIME             SO_TXTIME
+#define SCM_DROP_IF_LATE       0x4037
+#define SCM_CLOCKID            0x4038
+
 #endif /* _UAPI_ASM_SOCKET_H */
Index: linux-4.16.12-rt5/arch/s390/include/uapi/asm/socket.h
===================================================================
--- linux-4.16.12-rt5.orig/arch/s390/include/uapi/asm/socket.h
+++ linux-4.16.12-rt5/arch/s390/include/uapi/asm/socket.h
@@ -111,4 +111,9 @@
 
 #define SO_ZEROCOPY            60
 
+#define SO_TXTIME              61
+#define SCM_TXTIME             SO_TXTIME
+#define SCM_DROP_IF_LATE       62
+#define SCM_CLOCKID            63
+
 #endif /* _ASM_SOCKET_H */
Index: linux-4.16.12-rt5/arch/sparc/include/uapi/asm/socket.h
===================================================================
--- linux-4.16.12-rt5.orig/arch/sparc/include/uapi/asm/socket.h
+++ linux-4.16.12-rt5/arch/sparc/include/uapi/asm/socket.h
@@ -101,6 +101,11 @@
 
 #define SO_ZEROCOPY            0x003e
 
+#define SO_TXTIME              0x003f
+#define SCM_TXTIME             SO_TXTIME
+#define SCM_DROP_IF_LATE       0x0040
+#define SCM_CLOCKID            0x0041
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION             0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT       0x5002
Index: linux-4.16.12-rt5/arch/xtensa/include/uapi/asm/socket.h
===================================================================
--- linux-4.16.12-rt5.orig/arch/xtensa/include/uapi/asm/socket.h
+++ linux-4.16.12-rt5/arch/xtensa/include/uapi/asm/socket.h
@@ -116,4 +116,9 @@
 
 #define SO_ZEROCOPY            60
 
+#define SO_TXTIME              61
+#define SCM_TXTIME             SO_TXTIME
+#define SCM_DROP_IF_LATE       62
+#define SCM_CLOCKID            63
+
 #endif /* _XTENSA_SOCKET_H */
Index: linux-4.16.12-rt5/drivers/net/ethernet/intel/igb/e1000_defines.h
===================================================================
--- linux-4.16.12-rt5.orig/drivers/net/ethernet/intel/igb/e1000_defines.h
+++ linux-4.16.12-rt5/drivers/net/ethernet/intel/igb/e1000_defines.h
@@ -1066,6 +1066,22 @@
 #define E1000_TQAVCTRL_XMIT_MODE       BIT(0)
 #define E1000_TQAVCTRL_DATAFETCHARB    BIT(4)
 #define E1000_TQAVCTRL_DATATRANARB     BIT(8)
+#define E1000_TQAVCTRL_DATATRANTIM     BIT(9)
+#define E1000_TQAVCTRL_SP_WAIT_SR      BIT(10)
+/* Fetch Time Delta - bits 31:16
+ *
+ * This field holds the value to be reduced from the launch time for
+ * fetch time decision. The FetchTimeDelta value is defined in 32 ns
+ * granularity.
+ *
+ * This field is 16 bits wide, and so the maximum value is:
+ *
+ * 65535 * 32 = 2097120 ~= 2.1 msec
+ *
+ * XXX: We are configuring the max value here since we couldn't come up
+ * with a reason for not doing so.
+ */
+#define E1000_TQAVCTRL_FETCHTIME_DELTA (0xFFFF << 16)
 
 /* TX Qav Credit Control fields */
 #define E1000_TQAVCC_IDLESLOPE_MASK    0xFFFF
Index: linux-4.16.12-rt5/drivers/net/ethernet/intel/igb/igb.h
===================================================================
--- linux-4.16.12-rt5.orig/drivers/net/ethernet/intel/igb/igb.h
+++ linux-4.16.12-rt5/drivers/net/ethernet/intel/igb/igb.h
@@ -281,6 +281,7 @@ struct igb_ring {
        u16 count;                      /* number of desc. in the ring */
        u8 queue_index;                 /* logical index of the ring*/
        u8 reg_idx;                     /* physical index of the ring */
+       bool launchtime_enable;         /* true if LaunchTime is enabled */
        bool cbs_enable;                /* indicates if CBS is enabled */
        s32 idleslope;                  /* idleSlope in kbps */
        s32 sendslope;                  /* sendSlope in kbps */
Index: linux-4.16.12-rt5/drivers/net/ethernet/intel/igb/igb_main.c
===================================================================
--- linux-4.16.12-rt5.orig/drivers/net/ethernet/intel/igb/igb_main.c
+++ linux-4.16.12-rt5/drivers/net/ethernet/intel/igb/igb_main.c
@@ -1672,35 +1672,72 @@ static void set_queue_mode(struct e1000_
        wr32(E1000_I210_TQAVCC(queue), val);
 }
 
+/* True if CBS is enabled on at least one Tx ring of @adapter. */
+static bool is_any_cbs_enabled(struct igb_adapter *adapter)
+{
+       bool found = false;
+       int q;
+
+       for (q = 0; q < adapter->num_tx_queues && !found; q++)
+               found = adapter->tx_ring[q]->cbs_enable;
+
+       return found;
+}
+
+/* True if LaunchTime is enabled on at least one Tx ring of @adapter. */
+static bool is_any_txtime_enabled(struct igb_adapter *adapter)
+{
+       bool found = false;
+       int q;
+
+       for (q = 0; q < adapter->num_tx_queues && !found; q++)
+               found = adapter->tx_ring[q]->launchtime_enable;
+
+       return found;
+}
+
 /**
- *  igb_configure_cbs - Configure Credit-Based Shaper (CBS)
+ *  igb_config_tx_modes - Configure "Qav Tx mode" features on igb
  *  @adapter: pointer to adapter struct
  *  @queue: queue number
- *  @enable: true = enable CBS, false = disable CBS
- *  @idleslope: idleSlope in kbps
- *  @sendslope: sendSlope in kbps
- *  @hicredit: hiCredit in bytes
- *  @locredit: loCredit in bytes
- *
- *  Configure CBS for a given hardware queue. When disabling, idleslope,
- *  sendslope, hicredit, locredit arguments are ignored. Returns 0 if
- *  success. Negative otherwise.
- **/
-static void igb_configure_cbs(struct igb_adapter *adapter, int queue,
-                             bool enable, int idleslope, int sendslope,
-                             int hicredit, int locredit)
+ *
+ *  Configure CBS and Launchtime for a given hardware queue.
+ *  Parameters are retrieved from the correct Tx ring, so
+ *  igb_save_cbs_params() and igb_save_txtime_params() should be used
+ *  for setting those correctly prior to this function being called.
+ **/
+static void igb_config_tx_modes(struct igb_adapter *adapter, int queue)
 {
+       struct igb_ring *ring = adapter->tx_ring[queue];
        struct net_device *netdev = adapter->netdev;
        struct e1000_hw *hw = &adapter->hw;
-       u32 tqavcc;
+       u32 tqavcc, tqavctrl;
        u16 value;
 
        WARN_ON(hw->mac.type != e1000_i210);
        WARN_ON(queue < 0 || queue > 1);
 
-       if (enable) {
+       /* If any of the Qav features is enabled, configure queues as SR and
+        * with HIGH PRIO. If none is, then configure them with LOW PRIO and
+        * as SP.
+        */
+       if (ring->cbs_enable || ring->launchtime_enable) {
                set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH);
                set_queue_mode(hw, queue, QUEUE_MODE_STREAM_RESERVATION);
+       } else {
+               set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_LOW);
+               set_queue_mode(hw, queue, QUEUE_MODE_STRICT_PRIORITY);
+       }
+
+       /* If CBS is enabled, set DataTranARB and config its parameters. */
+       if (ring->cbs_enable) {
+               /* Always set data transfer arbitration to credit-based
+                * shaper algorithm on TQAVCTRL if CBS is enabled for any of
+                * the queues.
+                */
+               tqavctrl = rd32(E1000_I210_TQAVCTRL);
+               tqavctrl |= E1000_TQAVCTRL_DATATRANARB;
+               wr32(E1000_I210_TQAVCTRL, tqavctrl);
 
                /* According to i210 datasheet section 7.2.7.7, we should set
                 * the 'idleSlope' field from TQAVCC register following the
@@ -1759,17 +1796,16 @@ static void igb_configure_cbs(struct igb
                 *       calculated value, so the resulting bandwidth might
                 *       be slightly higher for some configurations.
                 */
-               value = DIV_ROUND_UP_ULL(idleslope * 61034ULL, 1000000);
+               value = DIV_ROUND_UP_ULL(ring->idleslope * 61034ULL, 1000000);
 
                tqavcc = rd32(E1000_I210_TQAVCC(queue));
                tqavcc &= ~E1000_TQAVCC_IDLESLOPE_MASK;
                tqavcc |= value;
                wr32(E1000_I210_TQAVCC(queue), tqavcc);
 
-               wr32(E1000_I210_TQAVHC(queue), 0x80000000 + hicredit * 0x7735);
+               wr32(E1000_I210_TQAVHC(queue),
+                    0x80000000 + ring->hicredit * 0x7735);
        } else {
-               set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_LOW);
-               set_queue_mode(hw, queue, QUEUE_MODE_STRICT_PRIORITY);
 
                /* Set idleSlope to zero. */
                tqavcc = rd32(E1000_I210_TQAVCC(queue));
@@ -1778,6 +1814,43 @@ static void igb_configure_cbs(struct igb
 
                /* Set hiCredit to zero. */
                wr32(E1000_I210_TQAVHC(queue), 0);
+
+               /* If CBS is not enabled for any queues anymore, then return to
+                * the default state of Data Transmission Arbitration on
+                * TQAVCTRL.
+                */
+               if (!is_any_cbs_enabled(adapter)) {
+                       tqavctrl = rd32(E1000_I210_TQAVCTRL);
+                       tqavctrl &= ~E1000_TQAVCTRL_DATATRANARB;
+                       wr32(E1000_I210_TQAVCTRL, tqavctrl);
+               }
+       }
+
+       /* If LaunchTime is enabled, set DataTranTIM. */
+       if (ring->launchtime_enable) {
+               /* Always set DataTranTIM on TQAVCTRL if LaunchTime is enabled
+                * for any of the SR queues, and configure fetchtime delta.
+                * XXX NOTE:
+                *     - LaunchTime will be enabled for all SR queues.
+                *     - A fixed offset can be added relative to the launch
+                *       time of all packets if configured at reg LAUNCH_OS0.
+                *       We are keeping it as 0 for now (default value).
+                */
+               tqavctrl = rd32(E1000_I210_TQAVCTRL);
+               tqavctrl |= E1000_TQAVCTRL_DATATRANTIM |
+                      E1000_TQAVCTRL_FETCHTIME_DELTA;
+               wr32(E1000_I210_TQAVCTRL, tqavctrl);
+       } else {
+               /* If Launchtime is not enabled for any SR queues anymore,
+                * then clear DataTranTIM on TQAVCTRL and clear fetchtime delta,
+                * effectively disabling Launchtime.
+                */
+               if (!is_any_txtime_enabled(adapter)) {
+                       tqavctrl = rd32(E1000_I210_TQAVCTRL);
+                       tqavctrl &= ~E1000_TQAVCTRL_DATATRANTIM;
+                       tqavctrl &= ~E1000_TQAVCTRL_FETCHTIME_DELTA;
+                       wr32(E1000_I210_TQAVCTRL, tqavctrl);
+               }
        }
 
        /* XXX: In i210 controller the sendSlope and loCredit parameters from
@@ -1785,9 +1858,27 @@ static void igb_configure_cbs(struct igb
         * configuration' in respect to these parameters.
         */
 
-       netdev_dbg(netdev, "CBS %s: queue %d idleslope %d sendslope %d hiCredit %d locredit %d\n",
-                  (enable) ? "enabled" : "disabled", queue,
-                  idleslope, sendslope, hicredit, locredit);
+       netdev_dbg(netdev, "Qav Tx mode: cbs %s, launchtime %s, queue %d "
+                          "idleslope %d sendslope %d hiCredit %d "
+                          "locredit %d\n",
+                  (ring->cbs_enable) ? "enabled" : "disabled",
+                  (ring->launchtime_enable) ? "enabled" : "disabled", queue,
+                  ring->idleslope, ring->sendslope, ring->hicredit,
+                  ring->locredit);
+}
+
+/* Store the LaunchTime flag for Tx ring @queue without touching hardware.
+ * Returns 0, or -EINVAL when @queue is not in 0 .. num_tx_queues - 1.
+ */
+static int igb_save_txtime_params(struct igb_adapter *adapter, int queue,
+                                 bool enable)
+{
+       if (queue < 0 || queue >= adapter->num_tx_queues)
+               return -EINVAL;
+
+       adapter->tx_ring[queue]->launchtime_enable = enable;
+
+       return 0;
 }
 
 static int igb_save_cbs_params(struct igb_adapter *adapter, int queue,
@@ -1810,21 +1901,15 @@ static int igb_save_cbs_params(struct ig
        return 0;
 }
 
-static bool is_any_cbs_enabled(struct igb_adapter *adapter)
-{
-       struct igb_ring *ring;
-       int i;
-
-       for (i = 0; i < adapter->num_tx_queues; i++) {
-               ring = adapter->tx_ring[i];
-
-               if (ring->cbs_enable)
-                       return true;
-       }
-
-       return false;
-}
-
+/**
+ *  igb_setup_tx_mode - Switch to/from Qav Tx mode when applicable
+ *  @adapter: pointer to adapter struct
+ *
+ *  Configure TQAVCTRL register switching the controller's Tx mode
+ *  if FQTSS mode is enabled or disabled. Additionally, will issue
+ *  a call to igb_config_tx_modes() per queue so any previously saved
+ *  Tx parameters are applied.
+ **/
 static void igb_setup_tx_mode(struct igb_adapter *adapter)
 {
        struct net_device *netdev = adapter->netdev;
@@ -1839,11 +1924,11 @@ static void igb_setup_tx_mode(struct igb
                int i, max_queue;
 
                /* Configure TQAVCTRL register: set transmit mode to 'Qav',
-                * set data fetch arbitration to 'round robin' and set data
-                * transfer arbitration to 'credit shaper algorithm.
+                * set data fetch arbitration to 'round robin', set SP_WAIT_SR
+                * so SP queues wait for SR ones.
                 */
                val = rd32(E1000_I210_TQAVCTRL);
-               val |= E1000_TQAVCTRL_XMIT_MODE | E1000_TQAVCTRL_DATATRANARB;
+               val |= E1000_TQAVCTRL_XMIT_MODE | E1000_TQAVCTRL_SP_WAIT_SR;
                val &= ~E1000_TQAVCTRL_DATAFETCHARB;
                wr32(E1000_I210_TQAVCTRL, val);
 
@@ -1884,11 +1969,7 @@ static void igb_setup_tx_mode(struct igb
                            adapter->num_tx_queues : I210_SR_QUEUES_NUM;
 
                for (i = 0; i < max_queue; i++) {
-                       struct igb_ring *ring = adapter->tx_ring[i];
-
-                       igb_configure_cbs(adapter, i, ring->cbs_enable,
-                                         ring->idleslope, ring->sendslope,
-                                         ring->hicredit, ring->locredit);
+                       igb_config_tx_modes(adapter, i);
                }
        } else {
                wr32(E1000_RXPBS, I210_RXPBSIZE_DEFAULT);
@@ -2461,6 +2542,19 @@ igb_features_check(struct sk_buff *skb,
        return features;
 }
 
+/* Turn FQTSS on if off; else reconfigure @queue and drop FQTSS if unused. */
+static void igb_offload_apply(struct igb_adapter *adapter, s32 queue)
+{
+       if (is_fqtss_enabled(adapter)) {
+               igb_config_tx_modes(adapter, queue);
+               if (!(is_any_cbs_enabled(adapter) ||
+                     is_any_txtime_enabled(adapter)))
+                       enable_fqtss(adapter, false);
+       } else {
+               enable_fqtss(adapter, true);
+       }
+}
+
 static int igb_offload_cbs(struct igb_adapter *adapter,
                           struct tc_cbs_qopt_offload *qopt)
 {
@@ -2481,17 +2575,31 @@ static int igb_offload_cbs(struct igb_ad
        if (err)
                return err;
 
-       if (is_fqtss_enabled(adapter)) {
-               igb_configure_cbs(adapter, qopt->queue, qopt->enable,
-                                 qopt->idleslope, qopt->sendslope,
-                                 qopt->hicredit, qopt->locredit);
+       igb_offload_apply(adapter, qopt->queue);
 
-               if (!is_any_cbs_enabled(adapter))
-                       enable_fqtss(adapter, false);
+       return 0;
+}
 
-       } else {
-               enable_fqtss(adapter, true);
-       }
+static int igb_offload_txtime(struct igb_adapter *adapter,
+                             struct tc_tbs_qopt_offload *qopt)
+{
+       s32 queue = qopt->queue;
+       int ret;
+
+       /* Only the i210 controller implements LaunchTime offloading. */
+       if (adapter->hw.mac.type != e1000_i210)
+               return -EOPNOTSUPP;
+
+       /* Only queues 0 and 1 can be used for LaunchTime offloading. */
+       if (queue < 0 || queue > 1)
+               return -EINVAL;
+
+       /* Record the flag on the ring first, then apply the configuration. */
+       ret = igb_save_txtime_params(adapter, queue, qopt->enable);
+       if (ret)
+               return ret;
+
+       igb_offload_apply(adapter, queue);
 
        return 0;
 }
@@ -2504,6 +2612,8 @@ static int igb_setup_tc(struct net_devic
        switch (type) {
        case TC_SETUP_QDISC_CBS:
                return igb_offload_cbs(adapter, type_data);
+       case TC_SETUP_QDISC_TBS:
+               return igb_offload_txtime(adapter, type_data);
 
        default:
                return -EOPNOTSUPP;
@@ -5315,11 +5425,14 @@ set_itr_now:
        }
 }
 
-static void igb_tx_ctxtdesc(struct igb_ring *tx_ring, u32 vlan_macip_lens,
-                           u32 type_tucmd, u32 mss_l4len_idx)
+static void igb_tx_ctxtdesc(struct igb_ring *tx_ring,
+                           struct igb_tx_buffer *first,
+                           u32 vlan_macip_lens, u32 type_tucmd,
+                           u32 mss_l4len_idx)
 {
        struct e1000_adv_tx_context_desc *context_desc;
        u16 i = tx_ring->next_to_use;
+       struct timespec64 ts;
 
        context_desc = IGB_TX_CTXTDESC(tx_ring, i);
 
@@ -5334,9 +5447,18 @@ static void igb_tx_ctxtdesc(struct igb_r
                mss_l4len_idx |= tx_ring->reg_idx << 4;
 
        context_desc->vlan_macip_lens   = cpu_to_le32(vlan_macip_lens);
-       context_desc->seqnum_seed       = 0;
        context_desc->type_tucmd_mlhl   = cpu_to_le32(type_tucmd);
        context_desc->mss_l4len_idx     = cpu_to_le32(mss_l4len_idx);
+
+       /* We assume there is always a valid tx time available. Invalid times
+        * should have been handled by the upper layers.
+        */
+       if (tx_ring->launchtime_enable) {
+               ts = ns_to_timespec64(first->skb->tstamp);
+               context_desc->seqnum_seed = cpu_to_le32(ts.tv_nsec / 32);
+       } else {
+               context_desc->seqnum_seed = 0;
+       }
 }
 
 static int igb_tso(struct igb_ring *tx_ring,
@@ -5419,7 +5541,8 @@ static int igb_tso(struct igb_ring *tx_r
        vlan_macip_lens |= (ip.hdr - skb->data) << E1000_ADVTXD_MACLEN_SHIFT;
        vlan_macip_lens |= first->tx_flags & IGB_TX_FLAGS_VLAN_MASK;
 
-       igb_tx_ctxtdesc(tx_ring, vlan_macip_lens, type_tucmd, mss_l4len_idx);
+       igb_tx_ctxtdesc(tx_ring, first, vlan_macip_lens,
+                       type_tucmd, mss_l4len_idx);
 
        return 1;
 }
@@ -5474,7 +5597,7 @@ no_csum:
        vlan_macip_lens |= skb_network_offset(skb) << E1000_ADVTXD_MACLEN_SHIFT;
        vlan_macip_lens |= first->tx_flags & IGB_TX_FLAGS_VLAN_MASK;
 
-       igb_tx_ctxtdesc(tx_ring, vlan_macip_lens, type_tucmd, 0);
+       igb_tx_ctxtdesc(tx_ring, first, vlan_macip_lens, type_tucmd, 0);
 }
 
 #define IGB_SET_FLAG(_input, _flag, _result) \
Index: linux-4.16.12-rt5/include/linux/netdevice.h
===================================================================
--- linux-4.16.12-rt5.orig/include/linux/netdevice.h
+++ linux-4.16.12-rt5/include/linux/netdevice.h
@@ -781,6 +781,7 @@ enum tc_setup_type {
        TC_SETUP_QDISC_CBS,
        TC_SETUP_QDISC_RED,
        TC_SETUP_QDISC_PRIO,
+       TC_SETUP_QDISC_TBS,
 };
 
 /* These structures hold the attributes of bpf state that are being passed
@@ -3373,6 +3374,7 @@ static __always_inline int ____dev_forwa
 
        skb_scrub_packet(skb, true);
        skb->priority = 0;
+       skb->tstamp = 0;
        return 0;
 }
 
Index: linux-4.16.12-rt5/include/linux/posix-timers.h
===================================================================
--- linux-4.16.12-rt5.orig/include/linux/posix-timers.h
+++ linux-4.16.12-rt5/include/linux/posix-timers.h
@@ -28,6 +28,7 @@ struct cpu_timer_list {
  *
  * A clockid is invalid if bits 2, 1, and 0 are all set.
  */
+#define CLOCKID_INVALID                        GENMASK(2, 0)
 #define CPUCLOCK_PID(clock)            ((pid_t) ~((clock) >> 3))
 #define CPUCLOCK_PERTHREAD(clock) \
        (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
Index: linux-4.16.12-rt5/include/linux/skbuff.h
===================================================================
--- linux-4.16.12-rt5.orig/include/linux/skbuff.h
+++ linux-4.16.12-rt5/include/linux/skbuff.h
@@ -785,6 +785,9 @@ struct sk_buff {
        __u8                    tc_redirected:1;
        __u8                    tc_from_ingress:1;
 #endif
+       __u8                    tc_drop_if_late:1;
+
+       clockid_t               txtime_clockid;
 
 #ifdef CONFIG_NET_SCHED
        __u16                   tc_index;       /* traffic control index */
Index: linux-4.16.12-rt5/include/net/pkt_sched.h
===================================================================
--- linux-4.16.12-rt5.orig/include/net/pkt_sched.h
+++ linux-4.16.12-rt5/include/net/pkt_sched.h
@@ -72,6 +72,8 @@ struct qdisc_watchdog {
        struct Qdisc    *qdisc;
 };
 
+void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
+                                clockid_t clockid);
 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc);
 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires);
 
@@ -153,4 +155,9 @@ struct tc_cbs_qopt_offload {
        s32 sendslope;
 };
 
+struct tc_tbs_qopt_offload {
+       u8 enable;      /* LaunchTime on/off; igb stores it as a bool */
+       s32 queue;      /* Tx queue index; igb accepts only 0 and 1 */
+};
+
 #endif
Index: linux-4.16.12-rt5/include/net/sock.h
===================================================================
--- linux-4.16.12-rt5.orig/include/net/sock.h
+++ linux-4.16.12-rt5/include/net/sock.h
@@ -777,6 +777,7 @@ enum sock_flags {
        SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */
        SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
        SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
+       SOCK_TXTIME,
 };
 
 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
@@ -1568,8 +1569,11 @@ void sock_kzfree_s(struct sock *sk, void
 void sk_send_sigurg(struct sock *sk);
 
 struct sockcm_cookie {
+       u64 transmit_time;      /* SCM_TXTIME value; copied to skb->tstamp */
        u32 mark;
+       clockid_t clockid;      /* SCM_CLOCKID value for skb->txtime_clockid */
        u16 tsflags;
+       u8 drop_if_late;        /* SCM_DROP_IF_LATE value (0 or 1) */
 };
 
 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
Index: linux-4.16.12-rt5/include/uapi/asm-generic/socket.h
===================================================================
--- linux-4.16.12-rt5.orig/include/uapi/asm-generic/socket.h
+++ linux-4.16.12-rt5/include/uapi/asm-generic/socket.h
@@ -107,4 +107,9 @@
 
 #define SO_ZEROCOPY            60
 
+#define SO_TXTIME              61
+#define SCM_TXTIME             SO_TXTIME
+#define SCM_DROP_IF_LATE       62
+#define SCM_CLOCKID            63
+
 #endif /* __ASM_GENERIC_SOCKET_H */
Index: linux-4.16.12-rt5/include/uapi/linux/pkt_sched.h
===================================================================
--- linux-4.16.12-rt5.orig/include/uapi/linux/pkt_sched.h
+++ linux-4.16.12-rt5/include/uapi/linux/pkt_sched.h
@@ -934,4 +934,22 @@ enum {
 
 #define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
 
+
+/* TBS */
+struct tc_tbs_qopt {
+       __s32 delta;    /* NOTE(review): unit not visible here - confirm */
+       __s32 clockid;  /* presumably a clockid_t value - TODO confirm */
+       __u32 flags;    /* bitmask of the TC_TBS_*_ON values below */
+#define TC_TBS_SORTING_ON (1U << 0)    /* BIT() is not available to userspace */
+#define TC_TBS_OFFLOAD_ON (1U << 1)
+};
+
+enum {
+       TCA_TBS_UNSPEC,
+       TCA_TBS_PARMS,
+       __TCA_TBS_MAX,
+};
+
+#define TCA_TBS_MAX (__TCA_TBS_MAX - 1)
+
 #endif
Index: linux-4.16.12-rt5/net/core/skbuff.c
===================================================================
--- linux-4.16.12-rt5.orig/net/core/skbuff.c
+++ linux-4.16.12-rt5/net/core/skbuff.c
@@ -4864,7 +4864,6 @@ EXPORT_SYMBOL(skb_try_coalesce);
  */
 void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 {
-       skb->tstamp = 0;
        skb->pkt_type = PACKET_HOST;
        skb->skb_iif = 0;
        skb->ignore_df = 0;
Index: linux-4.16.12-rt5/net/core/sock.c
===================================================================
--- linux-4.16.12-rt5.orig/net/core/sock.c
+++ linux-4.16.12-rt5/net/core/sock.c
@@ -91,6 +91,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <asm/unaligned.h>
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/errqueue.h>
@@ -1061,6 +1062,15 @@ set_rcvbuf:
                        sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
                break;
 
+       case SO_TXTIME:
+               if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+                       ret = -EPERM;
+               else if (val < 0 || val > 1)
+                       ret = -EINVAL;
+               else
+                       sock_valbool_flag(sk, SOCK_TXTIME, valbool);
+               break;
+
        default:
                ret = -ENOPROTOOPT;
                break;
@@ -1393,6 +1403,10 @@ int sock_getsockopt(struct socket *sock,
                v.val = sock_flag(sk, SOCK_ZEROCOPY);
                break;
 
+       case SO_TXTIME:
+               v.val = sock_flag(sk, SOCK_TXTIME);
+               break;
+
        default:
                /* We implement the SO_SNDLOWAT etc to not be settable
                 * (1003.1g 7).
@@ -2107,6 +2121,7 @@ int __sock_cmsg_send(struct sock *sk, st
                     struct sockcm_cookie *sockc)
 {
        u32 tsflags;
+       u8 drop;
 
        switch (cmsg->cmsg_type) {
        case SO_MARK:
@@ -2127,6 +2142,32 @@ int __sock_cmsg_send(struct sock *sk, st
                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
                sockc->tsflags |= tsflags;
                break;
+       case SCM_TXTIME:
+               if (!sock_flag(sk, SOCK_TXTIME))
+                       return -EINVAL;
+               if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
+                       return -EINVAL;
+               sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
+               break;
+       case SCM_DROP_IF_LATE:
+               if (!sock_flag(sk, SOCK_TXTIME))
+                       return -EINVAL;
+               if (cmsg->cmsg_len != CMSG_LEN(sizeof(u8)))
+                       return -EINVAL;
+
+               drop = get_unaligned((u8 *)CMSG_DATA(cmsg));
+               if (drop < 0 || drop > 1)
+                       return -EINVAL;
+
+               sockc->drop_if_late = drop;
+               break;
+       case SCM_CLOCKID:
+               if (!sock_flag(sk, SOCK_TXTIME))
+                       return -EINVAL;
+               if (cmsg->cmsg_len != CMSG_LEN(sizeof(clockid_t)))
+                       return -EINVAL;
+               sockc->clockid = get_unaligned((clockid_t *)CMSG_DATA(cmsg));
+               break;
        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
        case SCM_RIGHTS:
        case SCM_CREDENTIALS:
Index: linux-4.16.12-rt5/net/ipv4/raw.c
===================================================================
--- linux-4.16.12-rt5.orig/net/ipv4/raw.c
+++ linux-4.16.12-rt5/net/ipv4/raw.c
@@ -79,6 +79,7 @@
 #include <linux/netfilter_ipv4.h>
 #include <linux/compat.h>
 #include <linux/uio.h>
+#include <linux/posix-timers.h>
 
 struct raw_frag_vec {
        struct msghdr *msg;
@@ -381,6 +382,9 @@ static int raw_send_hdrinc(struct sock *
 
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;
+       skb->tstamp = sockc->transmit_time;
+       skb->txtime_clockid = sockc->clockid;
+       skb->tc_drop_if_late = sockc->drop_if_late;
        skb_dst_set(skb, &rt->dst);
        *rtp = NULL;
 
@@ -562,6 +566,9 @@ static int raw_sendmsg(struct sock *sk,
        }
 
        ipc.sockc.tsflags = sk->sk_tsflags;
+       ipc.sockc.transmit_time = 0;
+       ipc.sockc.drop_if_late = 0;
+       ipc.sockc.clockid = CLOCKID_INVALID;
        ipc.addr = inet->inet_saddr;
        ipc.opt = NULL;
        ipc.tx_flags = 0;
Index: linux-4.16.12-rt5/net/ipv4/udp.c
===================================================================
--- linux-4.16.12-rt5.orig/net/ipv4/udp.c
+++ linux-4.16.12-rt5/net/ipv4/udp.c
@@ -115,6 +115,7 @@
 #include "udp_impl.h"
 #include <net/sock_reuseport.h>
 #include <net/addrconf.h>
+#include <linux/posix-timers.h>
 
 struct udp_table udp_table __read_mostly;
 EXPORT_SYMBOL(udp_table);
@@ -926,6 +927,9 @@ int udp_sendmsg(struct sock *sk, struct
        }
 
        ipc.sockc.tsflags = sk->sk_tsflags;
+       ipc.sockc.transmit_time = 0;
+       ipc.sockc.drop_if_late = 0;
+       ipc.sockc.clockid = CLOCKID_INVALID;
        ipc.addr = inet->inet_saddr;
        ipc.oif = sk->sk_bound_dev_if;
 
@@ -1042,8 +1046,12 @@ back_from_confirm:
                                  sizeof(struct udphdr), &ipc, &rt,
                                  msg->msg_flags);
                err = PTR_ERR(skb);
-               if (!IS_ERR_OR_NULL(skb))
+               if (!IS_ERR_OR_NULL(skb)) {
+                       skb->tstamp = ipc.sockc.transmit_time;
+                       skb->txtime_clockid = ipc.sockc.clockid;
+                       skb->tc_drop_if_late = ipc.sockc.drop_if_late;
                        err = udp_send_skb(skb, fl4);
+               }
                goto out;
        }
 
Index: linux-4.16.12-rt5/net/packet/af_packet.c
===================================================================
--- linux-4.16.12-rt5.orig/net/packet/af_packet.c
+++ linux-4.16.12-rt5/net/packet/af_packet.c
@@ -94,6 +94,7 @@
 #endif
 #include <linux/bpf.h>
 #include <net/compat.h>
+#include <linux/posix-timers.h>
 
 #include "internal.h"
 
@@ -1983,6 +1984,9 @@ retry:
                goto out_unlock;
        }
 
+       sockc.transmit_time = 0;
+       sockc.drop_if_late = 0;
+       sockc.clockid = CLOCKID_INVALID;
        sockc.tsflags = sk->sk_tsflags;
        if (msg->msg_controllen) {
                err = sock_cmsg_send(sk, msg, &sockc);
@@ -1994,6 +1998,9 @@ retry:
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;
+       skb->tstamp = sockc.transmit_time;
+       skb->tc_drop_if_late = sockc.drop_if_late;
+       skb->txtime_clockid = sockc.clockid;
 
        sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
 
@@ -2491,6 +2498,9 @@ static int tpacket_fill_skb(struct packe
        skb->dev = dev;
        skb->priority = po->sk.sk_priority;
        skb->mark = po->sk.sk_mark;
+       skb->tstamp = sockc->transmit_time;
+       skb->tc_drop_if_late = sockc->drop_if_late;
+       skb->txtime_clockid = sockc->clockid;
        sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
        skb_shinfo(skb)->destructor_arg = ph.raw;
 
@@ -2667,6 +2677,9 @@ static int tpacket_snd(struct packet_soc
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_put;
 
+       sockc.transmit_time = 0;
+       sockc.drop_if_late = 0;
+       sockc.clockid = CLOCKID_INVALID;
        sockc.tsflags = po->sk.sk_tsflags;
        if (msg->msg_controllen) {
                err = sock_cmsg_send(&po->sk, msg, &sockc);
@@ -2863,6 +2876,9 @@ static int packet_snd(struct socket *soc
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_unlock;
 
+       sockc.transmit_time = 0;
+       sockc.drop_if_late = 0;
+       sockc.clockid = CLOCKID_INVALID;
        sockc.tsflags = sk->sk_tsflags;
        sockc.mark = sk->sk_mark;
        if (msg->msg_controllen) {
@@ -2937,6 +2953,9 @@ static int packet_snd(struct socket *soc
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        skb->mark = sockc.mark;
+       skb->tstamp = sockc.transmit_time;
+       skb->tc_drop_if_late = sockc.drop_if_late;
+       skb->txtime_clockid = sockc.clockid;
 
        if (has_vnet_hdr) {
                err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
Index: linux-4.16.12-rt5/net/sched/Kconfig
===================================================================
--- linux-4.16.12-rt5.orig/net/sched/Kconfig
+++ linux-4.16.12-rt5/net/sched/Kconfig
@@ -183,6 +183,17 @@ config NET_SCH_CBS
          To compile this code as a module, choose M here: the
          module will be called sch_cbs.
 
+config NET_SCH_TBS
+       tristate "Time Based Scheduler (TBS)"
+       ---help---
+         Say Y here if you want to use the Time Based Scheduler (TBS) packet
+         scheduling algorithm.
+
+         See the top of <file:net/sched/sch_tbs.c> for more details.
+
+         To compile this code as a module, choose M here: the
+         module will be called sch_tbs.
+
 config NET_SCH_GRED
        tristate "Generic Random Early Detection (GRED)"
        ---help---
Index: linux-4.16.12-rt5/net/sched/Makefile
===================================================================
--- linux-4.16.12-rt5.orig/net/sched/Makefile
+++ linux-4.16.12-rt5/net/sched/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_NET_SCH_FQ)      += sch_fq.o
 obj-$(CONFIG_NET_SCH_HHF)      += sch_hhf.o
 obj-$(CONFIG_NET_SCH_PIE)      += sch_pie.o
 obj-$(CONFIG_NET_SCH_CBS)      += sch_cbs.o
+obj-$(CONFIG_NET_SCH_TBS)      += sch_tbs.o
 
 obj-$(CONFIG_NET_CLS_U32)      += cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)   += cls_route.o
Index: linux-4.16.12-rt5/net/sched/sch_api.c
===================================================================
--- linux-4.16.12-rt5.orig/net/sched/sch_api.c
+++ linux-4.16.12-rt5/net/sched/sch_api.c
@@ -596,12 +596,19 @@ static enum hrtimer_restart qdisc_watchd
        return HRTIMER_NORESTART;
 }
 
-void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
+void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
+                                clockid_t clockid)
 {
-       hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+       hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
        wd->timer.function = qdisc_watchdog;
        wd->qdisc = qdisc;
 }
+EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
+
+void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
+{
+       qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
+}
 EXPORT_SYMBOL(qdisc_watchdog_init);
 
 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
Index: linux-4.16.12-rt5/net/sched/sch_tbs.c
===================================================================
--- /dev/null
+++ linux-4.16.12-rt5/net/sched/sch_tbs.c
@@ -0,0 +1,591 @@
+/*
+ * net/sched/sch_tbs.c Time Based Scheduler
+ *
+ *             This program is free software; you can redistribute it and/or
+ *             modify it under the terms of the GNU General Public License
+ *             as published by the Free Software Foundation; either version
+ *             2 of the License, or (at your option) any later version.
+ *
+ * Authors:    Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
+ *             Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/rbtree.h>
+#include <linux/skbuff.h>
+#include <linux/posix-timers.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+
+#define SORTING_IS_ON(x) ((x)->flags & TC_TBS_SORTING_ON) /* nonzero if set */
+#define OFFLOAD_IS_ON(x) ((x)->flags & TC_TBS_OFFLOAD_ON) /* nonzero if set */
+
+struct tbs_sched_data {
+       bool offload; /* Set from TC_TBS_OFFLOAD_ON in tbs_init(). */
+       bool sorting; /* Set from TC_TBS_SORTING_ON in tbs_init(). */
+       int clockid; /* Clock used for time reads and for the watchdog. */
+       int queue; /* Offset of sch->dev_queue from the device's tx queue 0. */
+       s32 delta; /* in ns */
+       ktime_t last; /* The txtime of the last skb sent to the netdevice. */
+       struct rb_root head; /* Root of the txtime-ordered tree of skbs. */
+       struct qdisc_watchdog watchdog; /* Only set up if !offload || sorting. */
+       struct Qdisc *qdisc;
+       int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch,
+                      struct sk_buff **to_free);
+       struct sk_buff *(*dequeue)(struct Qdisc *sch);
+       struct sk_buff *(*peek)(struct Qdisc *sch); /* ops: setup_queueing_mode() */
+};
+
+static const struct nla_policy tbs_policy[TCA_TBS_MAX + 1] = {
+       [TCA_TBS_PARMS] = { .len = sizeof(struct tc_tbs_qopt) },
+};
+
+typedef ktime_t (*get_time_func_t)(void);
+
+static const get_time_func_t clockid_to_get_time[MAX_CLOCKS] = {
+       [CLOCK_MONOTONIC] = ktime_get,
+       [CLOCK_REALTIME] = ktime_get_real,
+       [CLOCK_BOOTTIME] = ktime_get_boottime,
+       [CLOCK_TAI] = ktime_get_clocktai,
+};
+
+/* Current time of @clockid, or 0 if it is out of range or unsupported. */
+static ktime_t get_time_by_clockid(clockid_t clockid)
+{
+       if (clockid < 0 || clockid >= MAX_CLOCKS ||
+           !clockid_to_get_time[clockid])
+               return 0;
+
+       return clockid_to_get_time[clockid]();
+}
+
+static inline int validate_input_params(struct tc_tbs_qopt *qopt,
+                                       struct netlink_ext_ack *extack)
+{
+       /* Return 0 if params comply with the following rules, else -errno:
+        *      * If SW best-effort, then clockid and delta must be valid.
+        *
+        *      * If HW offload is ON and sorting is ON, then clockid and delta
+        *        must be valid.
+        *
+        *      * If HW offload is ON and sorting is OFF, then clockid and
+        *        delta must not have been set. The netdevice PHC will be used
+        *        implicitly.
+        *
+        *      * Dynamic clockids are not supported.
+        *      * Delta must not be negative.
+        */
+       if (!OFFLOAD_IS_ON(qopt) || SORTING_IS_ON(qopt)) {
+               if ((qopt->clockid & CLOCKID_INVALID) == CLOCKID_INVALID ||
+                   qopt->clockid >= MAX_CLOCKS) {
+                       NL_SET_ERR_MSG(extack, "Invalid clockid");
+                       return -EINVAL;
+               } else if (qopt->clockid < 0 ||
+                          !clockid_to_get_time[qopt->clockid]) {
+                       NL_SET_ERR_MSG(extack, "Clockid is not supported");
+                       return -EOPNOTSUPP;
+               }
+
+               if (qopt->delta < 0) {
+                       NL_SET_ERR_MSG(extack, "Delta must be positive");
+                       return -EINVAL;
+               }
+       } else {
+               if (qopt->delta != 0) {
+                       NL_SET_ERR_MSG(extack, "Cannot set delta for this mode");
+                       return -EINVAL;
+               }
+               if ((qopt->clockid & CLOCKID_INVALID) != CLOCKID_INVALID) {
+                       NL_SET_ERR_MSG(extack, "Cannot set clockid for this mode");
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+       ktime_t txtime = nskb->tstamp;
+       struct sock *sk = nskb->sk;
+       ktime_t now;
+
+       if (sk && !sock_flag(sk, SOCK_TXTIME))
+               return false;
+
+       /* We don't perform crosstimestamping.
+        * Drop if packet's clockid differs from qdisc's.
+        */
+       if (nskb->txtime_clockid != q->clockid)
+               return false;
+
+       now = get_time_by_clockid(q->clockid);
+       if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
+               return false;
+
+       return true;
+}
+
+static struct sk_buff *tbs_peek(struct Qdisc *sch)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+
+       return q->peek(sch);
+}
+
+static struct sk_buff *tbs_peek_timesortedlist(struct Qdisc *sch)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+       struct rb_node *p;
+
+       p = rb_first(&q->head);
+       if (!p)
+               return NULL;
+
+       return rb_to_skb(p);
+}
+
+static void reset_watchdog(struct Qdisc *sch)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+       struct sk_buff *skb = tbs_peek(sch);
+       ktime_t next;
+
+       if (!skb)
+               return;
+
+       next = ktime_sub_ns(skb->tstamp, q->delta);
+       qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
+}
+
+static int tbs_enqueue(struct sk_buff *nskb, struct Qdisc *sch,
+                      struct sk_buff **to_free)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+
+       return q->enqueue(nskb, sch, to_free);
+}
+
+static int tbs_enqueue_fifo(struct sk_buff *nskb, struct Qdisc *sch,
+                           struct sk_buff **to_free)
+{
+       if (!is_packet_valid(sch, nskb))
+               return qdisc_drop(nskb, sch, to_free);
+
+       return qdisc_enqueue_tail(nskb, sch);
+}
+
+static int tbs_enqueue_scheduledfifo(struct sk_buff *nskb, struct Qdisc *sch,
+                                    struct sk_buff **to_free)
+{
+       int err;
+
+       if (!is_packet_valid(sch, nskb))
+               return qdisc_drop(nskb, sch, to_free);
+
+       err = qdisc_enqueue_tail(nskb, sch);
+
+       /* If there is only 1 packet, then we must reset the watchdog. */
+       if (err >= 0 && sch->q.qlen == 1)
+               reset_watchdog(sch);
+
+       return err;
+}
+
+static int tbs_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
+                                     struct sk_buff **to_free)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+       struct rb_node **p = &q->head.rb_node, *parent = NULL;
+       ktime_t txtime = nskb->tstamp;
+
+       if (!is_packet_valid(sch, nskb))
+               return qdisc_drop(nskb, sch, to_free);
+
+       while (*p) {
+               struct sk_buff *skb;
+
+               parent = *p;
+               skb = rb_to_skb(parent);
+               if (ktime_after(txtime, skb->tstamp))
+                       p = &parent->rb_right;
+               else
+                       p = &parent->rb_left;
+       }
+       rb_link_node(&nskb->rbnode, parent, p);
+       rb_insert_color(&nskb->rbnode, &q->head);
+
+       qdisc_qstats_backlog_inc(sch, nskb);
+       sch->q.qlen++;
+
+       /* Now we may need to re-arm the qdisc watchdog for the next packet. */
+       reset_watchdog(sch);
+
+       return NET_XMIT_SUCCESS;
+}
+
+/* Remove @skb from the rb-tree, then either drop it or count it as sent. */
+static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
+                                bool drop)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+
+       rb_erase(&skb->rbnode, &q->head);
+
+       /* The rbnode field in the skb re-uses these fields, now that
+        * we are done with the rbnode, reset them. Do it before the drop
+        * path below, which frees the skb.
+        */
+       skb->next = NULL;
+       skb->prev = NULL;
+       skb->dev = qdisc_dev(sch);
+
+       qdisc_qstats_backlog_dec(sch, skb);
+       sch->q.qlen--;
+
+       if (drop) {
+               struct sk_buff *to_free = NULL;
+
+               qdisc_drop(skb, sch, &to_free);
+               kfree_skb_list(to_free);
+               qdisc_qstats_overlimit(sch);
+       } else {
+               qdisc_bstats_update(sch, skb);
+               q->last = skb->tstamp;
+       }
+}
+
+static struct sk_buff *tbs_dequeue(struct Qdisc *sch)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+
+       return q->dequeue(sch);
+}
+
+static struct sk_buff *tbs_dequeue_fifo(struct Qdisc *sch)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+       struct sk_buff *skb = qdisc_dequeue_head(sch);
+
+       /* XXX: The drop_if_late bit is not checked here because that would
+        *      require the PHC time to be read directly.
+        */
+
+       if (skb)
+               q->last = skb->tstamp;
+
+       return skb;
+}
+
+static struct sk_buff *tbs_dequeue_scheduledfifo(struct Qdisc *sch)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+       struct sk_buff *skb = tbs_peek(sch);
+       ktime_t now, next;
+
+       if (!skb)
+               return NULL;
+
+       now = get_time_by_clockid(q->clockid);
+
+       /* Drop if packet has expired while in queue and the drop_if_late
+        * flag is set.
+        */
+       if (skb->tc_drop_if_late && ktime_before(skb->tstamp, now)) {
+               struct sk_buff *to_free = NULL;
+
+               qdisc_queue_drop_head(sch, &to_free);
+               kfree_skb_list(to_free);
+               qdisc_qstats_overlimit(sch);
+
+               skb = NULL;
+               goto out;
+       }
+
+       next = ktime_sub_ns(skb->tstamp, q->delta);
+
+       /* Dequeue only if now is within the [txtime - delta, txtime] range. */
+       if (ktime_after(now, next))
+               skb = qdisc_dequeue_head(sch);
+       else
+               skb = NULL;
+
+out:
+       /* Now we may need to re-arm the qdisc watchdog for the next packet. */
+       reset_watchdog(sch);
+
+       return skb;
+}
+
+static struct sk_buff *tbs_dequeue_timesortedlist(struct Qdisc *sch)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+       struct sk_buff *skb;
+       ktime_t now, next;
+
+       skb = tbs_peek(sch);
+       if (!skb)
+               return NULL;
+
+       now = get_time_by_clockid(q->clockid);
+
+       /* Drop if packet has expired while in queue and the drop_if_late
+        * flag is set.
+        */
+       if (skb->tc_drop_if_late && ktime_before(skb->tstamp, now)) {
+               timesortedlist_erase(sch, skb, true);
+               skb = NULL;
+               goto out;
+       }
+
+       next = ktime_sub_ns(skb->tstamp, q->delta);
+
+       /* Dequeue only if now is within the [txtime - delta, txtime] range. */
+       if (ktime_after(now, next))
+               timesortedlist_erase(sch, skb, false);
+       else
+               skb = NULL;
+
+out:
+       /* Now we may need to re-arm the qdisc watchdog for the next packet. */
+       reset_watchdog(sch);
+
+       return skb;
+}
+
+static void tbs_disable_offload(struct net_device *dev,
+                               struct tbs_sched_data *q)
+{
+       struct tc_tbs_qopt_offload tbs = { };
+       const struct net_device_ops *ops;
+       int err;
+
+       if (!q->offload)
+               return;
+
+       ops = dev->netdev_ops;
+       if (!ops->ndo_setup_tc)
+               return;
+
+       tbs.queue = q->queue;
+       tbs.enable = 0;
+
+       err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBS, &tbs);
+       if (err < 0)
+               pr_warn("Couldn't disable TBS offload for queue %d\n",
+                       tbs.queue);
+}
+
+static int tbs_enable_offload(struct net_device *dev, struct tbs_sched_data *q,
+                             struct netlink_ext_ack *extack)
+{
+       const struct net_device_ops *ops = dev->netdev_ops;
+       struct tc_tbs_qopt_offload tbs = { };
+       int err;
+
+       if (q->offload)
+               return 0;
+
+       if (!ops->ndo_setup_tc) {
+               NL_SET_ERR_MSG(extack, "Specified device does not support TBS offload");
+               return -EOPNOTSUPP;
+       }
+
+       tbs.queue = q->queue;
+       tbs.enable = 1;
+
+       err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBS, &tbs);
+       if (err < 0) {
+               NL_SET_ERR_MSG(extack, "Specified device failed to setup TBS hardware offload");
+               return err;
+       }
+
+       return 0;
+}
+
+static inline void setup_queueing_mode(struct tbs_sched_data *q)
+{
+       if (q->sorting) {
+               q->enqueue = tbs_enqueue_timesortedlist;
+               q->dequeue = tbs_dequeue_timesortedlist;
+               q->peek = tbs_peek_timesortedlist;
+       } else {
+               if (q->offload) {
+                       q->enqueue = tbs_enqueue_fifo;
+                       q->dequeue = tbs_dequeue_fifo;
+                       q->peek = qdisc_peek_head;
+               } else {
+                       q->enqueue = tbs_enqueue_scheduledfifo;
+                       q->dequeue = tbs_dequeue_scheduledfifo;
+                       q->peek = qdisc_peek_head;
+               }
+       }
+}
+
+static int tbs_init(struct Qdisc *sch, struct nlattr *opt,
+                   struct netlink_ext_ack *extack)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+       struct net_device *dev = qdisc_dev(sch);
+       struct nlattr *tb[TCA_TBS_MAX + 1];
+       struct tc_tbs_qopt *qopt;
+       int err;
+
+       if (!opt) {
+               NL_SET_ERR_MSG(extack, "Missing TBS qdisc options which are mandatory");
+               return -EINVAL;
+       }
+
+       err = nla_parse_nested(tb, TCA_TBS_MAX, opt, tbs_policy, extack);
+       if (err < 0)
+               return err;
+
+       if (!tb[TCA_TBS_PARMS]) {
+               NL_SET_ERR_MSG(extack, "Missing mandatory TBS parameters");
+               return -EINVAL;
+       }
+
+       qopt = nla_data(tb[TCA_TBS_PARMS]);
+
+       pr_debug("delta %d clockid %d offload %s sorting %s\n",
+                qopt->delta, qopt->clockid,
+                OFFLOAD_IS_ON(qopt) ? "on" : "off",
+                SORTING_IS_ON(qopt) ? "on" : "off");
+
+       err = validate_input_params(qopt, extack);
+       if (err < 0)
+               return err;
+
+       q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
+
+       if (OFFLOAD_IS_ON(qopt)) {
+               err = tbs_enable_offload(dev, q, extack);
+               if (err < 0)
+                       return err;
+       }
+
+       /* Everything went OK, save the parameters used. */
+       q->delta = qopt->delta;
+       q->clockid = qopt->clockid;
+       q->offload = OFFLOAD_IS_ON(qopt);
+       q->sorting = SORTING_IS_ON(qopt);
+
+       /* Select queueing mode based on offload and sorting parameters. */
+       setup_queueing_mode(q);
+
+       /* The watchdog will be needed for SW best-effort or if TxTime
+        * based sorting is on.
+        */
+       if (!q->offload || q->sorting)
+               qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid);
+
+       return 0;
+}
+
+static void timesortedlist_clear(struct Qdisc *sch)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+       struct rb_node *p = rb_first(&q->head);
+
+       while (p) {
+               struct sk_buff *skb = rb_to_skb(p);
+
+               p = rb_next(p);
+
+               rb_erase(&skb->rbnode, &q->head);
+               rtnl_kfree_skbs(skb, skb);
+               sch->q.qlen--;
+       }
+}
+
+static void tbs_reset(struct Qdisc *sch)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+
+       /* Only cancel watchdog if it's been initialized. */
+       if (q->watchdog.qdisc == sch)
+               qdisc_watchdog_cancel(&q->watchdog);
+
+       /* No matter which mode we are on, it's safe to clear both lists. */
+       timesortedlist_clear(sch);
+       __qdisc_reset_queue(&sch->q);
+
+       sch->qstats.backlog = 0;
+       sch->q.qlen = 0;
+
+       q->last = 0;
+}
+
+static void tbs_destroy(struct Qdisc *sch)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+       struct net_device *dev = qdisc_dev(sch);
+
+       /* Only cancel watchdog if it's been initialized. */
+       if (q->watchdog.qdisc == sch)
+               qdisc_watchdog_cancel(&q->watchdog);
+
+       tbs_disable_offload(dev, q);
+}
+
+static int tbs_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+       struct tbs_sched_data *q = qdisc_priv(sch);
+       struct tc_tbs_qopt opt = { };
+       struct nlattr *nest;
+
+       nest = nla_nest_start(skb, TCA_OPTIONS);
+       if (!nest)
+               goto nla_put_failure;
+
+       opt.delta = q->delta;
+       opt.clockid = q->clockid;
+       if (q->offload)
+               opt.flags |= TC_TBS_OFFLOAD_ON;
+
+       if (q->sorting)
+               opt.flags |= TC_TBS_SORTING_ON;
+
+       if (nla_put(skb, TCA_TBS_PARMS, sizeof(opt), &opt))
+               goto nla_put_failure;
+
+       return nla_nest_end(skb, nest);
+
+nla_put_failure:
+       nla_nest_cancel(skb, nest);
+       return -1;
+}
+
+static struct Qdisc_ops tbs_qdisc_ops __read_mostly = {
+       .id             =       "tbs",
+       .priv_size      =       sizeof(struct tbs_sched_data),
+       .enqueue        =       tbs_enqueue,
+       .dequeue        =       tbs_dequeue,
+       .peek           =       tbs_peek,
+       .init           =       tbs_init,
+       .reset          =       tbs_reset,
+       .destroy        =       tbs_destroy,
+       .dump           =       tbs_dump,
+       .owner          =       THIS_MODULE,
+};
+
+static int __init tbs_module_init(void)
+{
+       return register_qdisc(&tbs_qdisc_ops);
+}
+
+static void __exit tbs_module_exit(void)
+{
+       unregister_qdisc(&tbs_qdisc_ops);
+}
+module_init(tbs_module_init)
+module_exit(tbs_module_exit)
+MODULE_LICENSE("GPL");