--- Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html | 8 Documentation/RCU/Design/Requirements/Requirements.html | 24 Documentation/RCU/checklist.txt | 4 Documentation/RCU/rcubarrier.txt | 8 Documentation/RCU/stallwarn.txt | 4 Documentation/RCU/whatisRCU.txt | 7 Documentation/admin-guide/sysctl/vm.rst | 3 Documentation/printk-ringbuffer.txt | 377 + Documentation/trace/ftrace-uses.rst | 2 arch/Kconfig | 1 arch/alpha/include/asm/spinlock_types.h | 4 arch/arc/kernel/entry.S | 6 arch/arm/Kconfig | 4 arch/arm/include/asm/irq.h | 2 arch/arm/include/asm/spinlock_types.h | 4 arch/arm/include/asm/switch_to.h | 10 arch/arm/include/asm/thread_info.h | 8 arch/arm/kernel/asm-offsets.c | 1 arch/arm/kernel/entry-armv.S | 23 arch/arm/kernel/entry-common.S | 9 arch/arm/kernel/signal.c | 3 arch/arm/kernel/smp.c | 2 arch/arm/kernel/traps.c | 2 arch/arm/mm/cache-v7.S | 4 arch/arm/mm/cache-v7m.S | 4 arch/arm/mm/fault.c | 6 arch/arm/mm/highmem.c | 58 arch/arm64/Kconfig | 54 arch/arm64/crypto/sha256-glue.c | 2 arch/arm64/include/asm/assembler.h | 6 arch/arm64/include/asm/kvm_mmu.h | 1 arch/arm64/include/asm/preempt.h | 29 arch/arm64/include/asm/spinlock_types.h | 4 arch/arm64/include/asm/thread_info.h | 6 arch/arm64/kernel/asm-offsets.c | 1 arch/arm64/kernel/entry.S | 15 arch/arm64/kernel/fpsimd.c | 14 arch/arm64/kernel/signal.c | 2 arch/arm64/kernel/smp.c | 4 arch/arm64/kernel/traps.c | 3 arch/arm64/kvm/va_layout.c | 8 arch/c6x/kernel/entry.S | 8 arch/csky/kernel/entry.S | 4 arch/h8300/kernel/entry.S | 6 arch/hexagon/include/asm/spinlock_types.h | 4 arch/hexagon/kernel/vm_entry.S | 6 arch/ia64/include/asm/spinlock_types.h | 4 arch/ia64/kernel/entry.S | 12 arch/ia64/kernel/kprobes.c | 2 arch/m68k/coldfire/entry.S | 2 arch/microblaze/kernel/entry.S | 2 arch/mips/Kconfig | 2 arch/mips/include/asm/asmmacro.h | 4 arch/mips/kernel/entry.S | 6 arch/nds32/Kconfig | 2 arch/nds32/kernel/ex-exit.S | 4 arch/nios2/kernel/entry.S | 2 arch/parisc/Kconfig | 2 arch/parisc/kernel/entry.S | 10 arch/powerpc/Kconfig | 6 arch/powerpc/include/asm/spinlock_types.h | 4 arch/powerpc/include/asm/stackprotector.h | 4 arch/powerpc/include/asm/thread_info.h | 16 arch/powerpc/kernel/asm-offsets.c | 1 arch/powerpc/kernel/entry_32.S | 27 arch/powerpc/kernel/entry_64.S | 28 arch/powerpc/kernel/irq.c | 2 arch/powerpc/kernel/misc_32.S | 2 arch/powerpc/kernel/misc_64.S | 2 arch/powerpc/kernel/traps.c | 8 arch/powerpc/kernel/watchdog.c | 5 arch/powerpc/kvm/Kconfig | 1 arch/powerpc/platforms/ps3/device-init.c | 4 arch/powerpc/platforms/pseries/iommu.c | 17 arch/riscv/kernel/entry.S | 4 arch/s390/Kconfig | 2 arch/s390/include/asm/preempt.h | 4 arch/s390/include/asm/spinlock_types.h | 4 arch/s390/kernel/dumpstack.c | 2 arch/s390/kernel/entry.S | 2 arch/sh/Kconfig | 2 arch/sh/include/asm/spinlock_types.h | 4 arch/sh/kernel/cpu/sh5/entry.S | 4 arch/sh/kernel/entry-common.S | 4 arch/sh/kernel/irq.c | 2 arch/sparc/Kconfig | 2 arch/sparc/kernel/irq_64.c | 2 arch/sparc/kernel/rtrap_64.S | 2 arch/x86/Kconfig | 6 arch/x86/crypto/aesni-intel_glue.c | 22 arch/x86/crypto/cast5_avx_glue.c | 21 arch/x86/crypto/chacha_glue.c | 11 arch/x86/crypto/glue_helper.c | 26 arch/x86/entry/common.c | 11 arch/x86/entry/entry_32.S | 18 arch/x86/entry/entry_64.S | 18 arch/x86/include/asm/fpu/api.h | 1 arch/x86/include/asm/preempt.h | 33 arch/x86/include/asm/signal.h | 13 arch/x86/include/asm/stackprotector.h | 8 arch/x86/include/asm/thread_info.h | 11 arch/x86/kernel/apic/io_apic.c | 16 arch/x86/kernel/asm-offsets.c | 5 arch/x86/kernel/cpu/mshyperv.c 
| 1 arch/x86/kernel/fpu/core.c | 12 arch/x86/kernel/irq_32.c | 2 arch/x86/kernel/process_32.c | 32 arch/x86/kvm/x86.c | 8 arch/x86/mm/highmem_32.c | 13 arch/x86/mm/iomap_32.c | 11 arch/x86/mm/tlb.c | 2 arch/xtensa/include/asm/spinlock_types.h | 4 arch/xtensa/kernel/entry.S | 2 arch/xtensa/kernel/traps.c | 7 block/blk-ioc.c | 3 block/blk-mq.c | 18 block/blk-softirq.c | 6 drivers/block/zram/zcomp.c | 13 drivers/block/zram/zcomp.h | 1 drivers/block/zram/zram_drv.c | 41 drivers/block/zram/zram_drv.h | 1 drivers/char/random.c | 11 drivers/char/tpm/tpm-dev-common.c | 1 drivers/char/tpm/tpm_tis.c | 29 drivers/clocksource/Kconfig | 7 drivers/clocksource/timer-atmel-tcb.c | 73 drivers/connector/cn_proc.c | 6 drivers/dma-buf/dma-buf.c | 8 drivers/dma-buf/dma-resv.c | 45 drivers/firmware/efi/efi.c | 5 drivers/gpu/drm/Kconfig | 2 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 6 drivers/gpu/drm/i915/display/intel_sprite.c | 13 drivers/gpu/drm/i915/gem/i915_gem_busy.c | 6 drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 19 drivers/gpu/drm/i915/gt/intel_engine.h | 1 drivers/gpu/drm/i915/gt/intel_engine_pm.c | 7 drivers/gpu/drm/i915/gt/intel_hangcheck.c | 2 drivers/gpu/drm/i915/gt/intel_reset.c | 2 drivers/gpu/drm/i915/i915_irq.c | 2 drivers/gpu/drm/i915/i915_request.c | 12 drivers/gpu/drm/i915/i915_trace.h | 6 drivers/gpu/drm/radeon/radeon_display.c | 2 drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c | 2 drivers/hv/hyperv_vmbus.h | 1 drivers/hv/vmbus_drv.c | 1 drivers/leds/trigger/Kconfig | 1 drivers/md/bcache/Kconfig | 1 drivers/md/raid5.c | 7 drivers/md/raid5.h | 1 drivers/media/platform/Kconfig | 2 drivers/net/phy/fixed_phy.c | 26 drivers/net/wireless/intersil/orinoco/orinoco_usb.c | 4 drivers/of/base.c | 133 drivers/of/dynamic.c | 2 drivers/of/of_private.h | 4 drivers/of/overlay.c | 10 drivers/pci/switch/switchtec.c | 22 drivers/scsi/fcoe/fcoe.c | 16 drivers/scsi/fcoe/fcoe_ctlr.c | 4 drivers/scsi/libfc/fc_exch.c | 4 drivers/thermal/intel/x86_pkg_temp_thermal.c | 24 drivers/tty/serial/8250/8250.h | 47 drivers/tty/serial/8250/8250_core.c | 17 drivers/tty/serial/8250/8250_fsl.c | 9 drivers/tty/serial/8250/8250_ingenic.c | 7 drivers/tty/serial/8250/8250_mtk.c | 29 drivers/tty/serial/8250/8250_port.c | 92 drivers/tty/serial/amba-pl011.c | 17 drivers/tty/serial/omap-serial.c | 12 drivers/usb/gadget/function/f_fs.c | 2 drivers/usb/gadget/legacy/inode.c | 4 drivers/video/backlight/Kconfig | 4 drivers/xen/preempt.c | 4 fs/afs/dir_silly.c | 2 fs/aio.c | 2 fs/btrfs/volumes.h | 2 fs/buffer.c | 21 fs/cifs/readdir.c | 2 fs/dcache.c | 50 fs/eventfd.c | 12 fs/eventpoll.c | 4 fs/ext4/page-io.c | 8 fs/fscache/cookie.c | 8 fs/fscache/internal.h | 1 fs/fscache/main.c | 7 fs/fscache/object.c | 13 fs/fuse/readdir.c | 2 fs/inode.c | 2 fs/jbd2/commit.c | 13 fs/jbd2/journal.c | 30 fs/jbd2/transaction.c | 144 fs/namei.c | 4 fs/namespace.c | 8 fs/nfs/delegation.c | 4 fs/nfs/dir.c | 12 fs/nfs/inode.c | 4 fs/nfs/nfs4_fs.h | 2 fs/nfs/nfs4proc.c | 4 fs/nfs/nfs4state.c | 22 fs/nfs/unlink.c | 35 fs/ntfs/aops.c | 9 fs/ocfs2/suballoc.c | 19 fs/proc/base.c | 3 fs/proc/kmsg.c | 4 fs/proc/proc_sysctl.c | 2 fs/squashfs/decompressor_multi_percpu.c | 16 fs/stack.c | 6 fs/userfaultfd.c | 12 include/linux/bottom_half.h | 5 include/linux/bpf.h | 38 include/linux/buffer_head.h | 6 include/linux/cgroup-defs.h | 5 include/linux/cgroup.h | 3 include/linux/completion.h | 8 include/linux/console.h | 6 include/linux/dcache.h | 4 include/linux/delay.h | 6 include/linux/dma-resv.h | 4 include/linux/eventfd.h | 11 include/linux/filter.h | 33 include/linux/fs.h | 
6 include/linux/fscache.h | 1 include/linux/genhd.h | 6 include/linux/hardirq.h | 2 include/linux/highmem.h | 32 include/linux/idr.h | 5 include/linux/interrupt.h | 8 include/linux/irq_work.h | 8 include/linux/irqflags.h | 23 include/linux/jbd2.h | 27 include/linux/journal-head.h | 21 include/linux/kernel.h | 12 include/linux/kmsg_dump.h | 6 include/linux/list_bl.h | 30 include/linux/locallock.h | 282 + include/linux/mm_types.h | 4 include/linux/mutex.h | 20 include/linux/mutex_rt.h | 130 include/linux/netdevice.h | 1 include/linux/nfs_fs.h | 4 include/linux/nfs_xdr.h | 2 include/linux/percpu-refcount.h | 16 include/linux/percpu-rwsem.h | 83 include/linux/percpu.h | 29 include/linux/pid.h | 1 include/linux/posix-timers.h | 11 include/linux/preempt.h | 139 include/linux/printk.h | 44 include/linux/printk_ringbuffer.h | 114 include/linux/radix-tree.h | 6 include/linux/ratelimit.h | 2 include/linux/rbtree.h | 2 include/linux/rcu_assign_pointer.h | 62 include/linux/rcupdate.h | 70 include/linux/rtmutex.h | 22 include/linux/rwlock_rt.h | 119 include/linux/rwlock_types.h | 4 include/linux/rwlock_types_rt.h | 55 include/linux/rwsem-rt.h | 69 include/linux/rwsem.h | 18 include/linux/sched.h | 153 include/linux/sched/mm.h | 10 include/linux/sched/wake_q.h | 13 include/linux/seqlock.h | 66 include/linux/serial_8250.h | 5 include/linux/signal.h | 1 include/linux/skbuff.h | 7 include/linux/smp.h | 16 include/linux/spinlock.h | 12 include/linux/spinlock_api_smp.h | 4 include/linux/spinlock_rt.h | 156 include/linux/spinlock_types.h | 76 include/linux/spinlock_types_nort.h | 33 include/linux/spinlock_types_raw.h | 55 include/linux/spinlock_types_rt.h | 48 include/linux/spinlock_types_up.h | 4 include/linux/stop_machine.h | 2 include/linux/swait.h | 2 include/linux/swap.h | 2 include/linux/thread_info.h | 12 include/linux/trace_events.h | 2 include/linux/u64_stats_sync.h | 61 include/linux/uaccess.h | 2 include/linux/vmstat.h | 4 include/linux/wait.h | 1 include/net/gen_stats.h | 11 include/net/neighbour.h | 6 include/net/net_seq_lock.h | 15 include/net/sch_generic.h | 19 include/xen/xen-ops.h | 4 init/Kconfig | 7 init/init_task.c | 4 kernel/Kconfig.locks | 12 kernel/Kconfig.preempt | 6 kernel/bpf/hashtab.c | 144 kernel/bpf/lpm_trie.c | 12 kernel/bpf/percpu_freelist.c | 20 kernel/bpf/stackmap.c | 17 kernel/bpf/syscall.c | 21 kernel/bpf/verifier.c | 40 kernel/cgroup/cgroup.c | 12 kernel/cgroup/cpuset.c | 70 kernel/cgroup/rstat.c | 55 kernel/cpu.c | 48 kernel/events/core.c | 4 kernel/exit.c | 3 kernel/fork.c | 27 kernel/futex.c | 107 kernel/irq/manage.c | 2 kernel/irq/spurious.c | 8 kernel/irq_work.c | 59 kernel/kexec_core.c | 1 kernel/ksysfs.c | 12 kernel/locking/Makefile | 10 kernel/locking/lockdep.c | 2 kernel/locking/locktorture.c | 1 kernel/locking/mutex-rt.c | 223 + kernel/locking/percpu-rwsem.c | 194 - kernel/locking/rtmutex.c | 944 ++++ kernel/locking/rtmutex_common.h | 29 kernel/locking/rwlock-rt.c | 384 ++ kernel/locking/rwsem-rt.c | 312 + kernel/locking/rwsem.c | 14 kernel/locking/rwsem.h | 10 kernel/locking/spinlock.c | 7 kernel/locking/spinlock_debug.c | 5 kernel/panic.c | 5 kernel/printk/Makefile | 1 kernel/printk/internal.h | 72 kernel/printk/printk.c | 1919 ++++------ kernel/printk/printk_safe.c | 414 -- kernel/ptrace.c | 32 kernel/rcu/Kconfig | 8 kernel/rcu/rcutorture.c | 98 kernel/rcu/srcutiny.c | 2 kernel/rcu/srcutree.c | 13 kernel/rcu/tree.c | 9 kernel/rcu/tree_exp.h | 2 kernel/rcu/tree_plugin.h | 10 kernel/rcu/update.c | 4 kernel/sched/completion.c | 34 kernel/sched/core.c | 408 +- 
kernel/sched/debug.c | 4 kernel/sched/fair.c | 16 kernel/sched/features.h | 8 kernel/sched/sched.h | 14 kernel/sched/swait.c | 17 kernel/sched/topology.c | 1 kernel/seccomp.c | 4 kernel/signal.c | 105 kernel/smp.c | 101 kernel/softirq.c | 231 + kernel/stop_machine.c | 7 kernel/sysctl.c | 29 kernel/time/hrtimer.c | 32 kernel/time/posix-cpu-timers.c | 203 - kernel/time/tick-sched.c | 10 kernel/time/timer.c | 11 kernel/trace/bpf_trace.c | 7 kernel/trace/trace.c | 29 kernel/trace/trace.h | 2 kernel/trace/trace_events.c | 2 kernel/trace/trace_output.c | 19 kernel/trace/trace_uprobe.c | 11 kernel/up.c | 12 kernel/workqueue.c | 191 lib/Kconfig.debug | 21 lib/Makefile | 2 lib/bust_spinlocks.c | 3 lib/debugobjects.c | 5 lib/irq_poll.c | 5 lib/locking-selftest.c | 50 lib/nmi_backtrace.c | 6 lib/printk_ringbuffer.c | 589 +++ lib/radix-tree.c | 28 lib/scatterlist.c | 2 lib/smp_processor_id.c | 7 lib/test_bpf.c | 4 localversion-rt | 1 mm/Kconfig | 2 mm/compaction.c | 10 mm/highmem.c | 6 mm/kmemleak.c | 112 mm/memcontrol.c | 30 mm/memory.c | 2 mm/page_alloc.c | 190 mm/slab.c | 90 mm/slab.h | 2 mm/slub.c | 163 mm/swap.c | 79 mm/vmalloc.c | 50 mm/vmstat.c | 12 mm/workingset.c | 5 mm/zsmalloc.c | 80 mm/zswap.c | 23 net/Kconfig | 2 net/bluetooth/rfcomm/sock.c | 7 net/bpf/test_run.c | 8 net/core/dev.c | 51 net/core/flow_dissector.c | 4 net/core/gen_estimator.c | 6 net/core/gen_stats.c | 12 net/core/skmsg.c | 8 net/kcm/kcmsock.c | 4 net/netfilter/nft_counter.c | 78 net/packet/af_packet.c | 5 net/sched/sch_api.c | 2 net/sched/sch_generic.c | 15 net/sunrpc/svc_xprt.c | 4 security/apparmor/include/path.h | 19 security/apparmor/lsm.c | 2 virt/kvm/arm/arch_timer.c | 8 virt/kvm/arm/arm.c | 6 414 files changed, 9921 insertions(+), 3895 deletions(-) Index: linux-5.4.290-rt95/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html =================================================================== --- linux-5.4.290-rt95.orig/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html +++ linux-5.4.290-rt95/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html @@ -56,8 +56,8 @@ sections. RCU-preempt Expedited Grace Periods

-CONFIG_PREEMPT=y kernels implement RCU-preempt. -The overall flow of the handling of a given CPU by an RCU-preempt +CONFIG_PREEMPT=y and CONFIG_PREEMPT_RT=y kernels implement +RCU-preempt. The overall flow of the handling of a given CPU by an RCU-preempt expedited grace period is shown in the following diagram:

ExpRCUFlow.svg @@ -140,8 +140,8 @@ or offline, among other things. RCU-sched Expedited Grace Periods

-CONFIG_PREEMPT=n kernels implement RCU-sched. -The overall flow of the handling of a given CPU by an RCU-sched +CONFIG_PREEMPT=n and CONFIG_PREEMPT_RT=n kernels implement +RCU-sched. The overall flow of the handling of a given CPU by an RCU-sched expedited grace period is shown in the following diagram:

ExpSchedFlow.svg Index: linux-5.4.290-rt95/Documentation/RCU/Design/Requirements/Requirements.html =================================================================== --- linux-5.4.290-rt95.orig/Documentation/RCU/Design/Requirements/Requirements.html +++ linux-5.4.290-rt95/Documentation/RCU/Design/Requirements/Requirements.html @@ -106,7 +106,7 @@ big RCU read-side critical section. Production-quality implementations of rcu_read_lock() and rcu_read_unlock() are extremely lightweight, and in fact have exactly zero overhead in Linux kernels built for production -use with CONFIG_PREEMPT=n. +use with CONFIG_PREEMPTION=n.

This guarantee allows ordering to be enforced with extremely low @@ -1499,7 +1499,7 @@ costs have plummeted. However, as I learned from Matt Mackall's bloatwatch efforts, memory footprint is critically important on single-CPU systems with -non-preemptible (CONFIG_PREEMPT=n) kernels, and thus +non-preemptible (CONFIG_PREEMPTION=n) kernels, and thus tiny RCU was born. Josh Triplett has since taken over the small-memory banner with his @@ -1887,7 +1887,7 @@ constructs, there are limitations.

Implementations of RCU for which rcu_read_lock() and rcu_read_unlock() generate no code, such as -Linux-kernel RCU when CONFIG_PREEMPT=n, can be +Linux-kernel RCU when CONFIG_PREEMPTION=n, can be nested arbitrarily deeply. After all, there is no overhead. Except that if all these instances of rcu_read_lock() @@ -2229,7 +2229,7 @@ be a no-op.

However, once the scheduler has spawned its first kthread, this early boot trick fails for synchronize_rcu() (as well as for -synchronize_rcu_expedited()) in CONFIG_PREEMPT=y +synchronize_rcu_expedited()) in CONFIG_PREEMPTION=y kernels. The reason is that an RCU read-side critical section might be preempted, which means that a subsequent synchronize_rcu() really does have @@ -2568,7 +2568,7 @@ the following:

If the compiler did make this transformation in a -CONFIG_PREEMPT=n kernel build, and if get_user() did +CONFIG_PREEMPTION=n kernel build, and if get_user() did page fault, the result would be a quiescent state in the middle of an RCU read-side critical section. This misplaced quiescent state could result in line 4 being @@ -2906,7 +2906,7 @@ in conjunction with the The real-time-latency response requirements are such that the traditional approach of disabling preemption across RCU read-side critical sections is inappropriate. -Kernels built with CONFIG_PREEMPT=y therefore +Kernels built with CONFIG_PREEMPTION=y therefore use an RCU implementation that allows RCU read-side critical sections to be preempted. This requirement made its presence known after users made it @@ -3064,7 +3064,7 @@ includes rcu_barrier_bh(), and rcu_read_lock_bh_held(). However, the update-side APIs are now simple wrappers for other RCU -flavors, namely RCU-sched in CONFIG_PREEMPT=n kernels and RCU-preempt +flavors, namely RCU-sched in CONFIG_PREEMPTION=n kernels and RCU-preempt otherwise.

Sched Flavor (Historical)

@@ -3088,12 +3088,12 @@ of an RCU read-side critical section can Therefore, RCU-sched was created, which follows “classic” RCU in that an RCU-sched grace period waits for pre-existing interrupt and NMI handlers. -In kernels built with CONFIG_PREEMPT=n, the RCU and RCU-sched +In kernels built with CONFIG_PREEMPTION=n, the RCU and RCU-sched APIs have identical implementations, while kernels built with -CONFIG_PREEMPT=y provide a separate implementation for each. +CONFIG_PREEMPTION=y provide a separate implementation for each.

-Note well that in CONFIG_PREEMPT=y kernels, +Note well that in CONFIG_PREEMPTION=y kernels, rcu_read_lock_sched() and rcu_read_unlock_sched() disable and re-enable preemption, respectively. This means that if there was a preemption attempt during the @@ -3302,12 +3302,12 @@ The tasks-RCU API is quite compact, cons call_rcu_tasks(), synchronize_rcu_tasks(), and rcu_barrier_tasks(). -In CONFIG_PREEMPT=n kernels, trampolines cannot be preempted, +In CONFIG_PREEMPTION=n kernels, trampolines cannot be preempted, so these APIs map to call_rcu(), synchronize_rcu(), and rcu_barrier(), respectively. -In CONFIG_PREEMPT=y kernels, trampolines can be preempted, +In CONFIG_PREEMPTION=y kernels, trampolines can be preempted, and these three APIs are therefore implemented by separate functions that check for voluntary context switches. Index: linux-5.4.290-rt95/Documentation/RCU/checklist.txt =================================================================== --- linux-5.4.290-rt95.orig/Documentation/RCU/checklist.txt +++ linux-5.4.290-rt95/Documentation/RCU/checklist.txt @@ -210,8 +210,8 @@ over a rather long period of time, but i the rest of the system. 7. As of v4.20, a given kernel implements only one RCU flavor, - which is RCU-sched for PREEMPT=n and RCU-preempt for PREEMPT=y. - If the updater uses call_rcu() or synchronize_rcu(), + which is RCU-sched for PREEMPTION=n and RCU-preempt for + PREEMPTION=y. If the updater uses call_rcu() or synchronize_rcu(), then the corresponding readers may use rcu_read_lock() and rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(), or any pair of primitives that disables and re-enables preemption, Index: linux-5.4.290-rt95/Documentation/RCU/rcubarrier.txt =================================================================== --- linux-5.4.290-rt95.orig/Documentation/RCU/rcubarrier.txt +++ linux-5.4.290-rt95/Documentation/RCU/rcubarrier.txt @@ -6,8 +6,8 @@ RCU (read-copy update) is a synchronizat of as a replacement for reader-writer locking (among other things), but with very low-overhead readers that are immune to deadlock, priority inversion, and unbounded latency. RCU read-side critical sections are delimited -by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT -kernels, generate no code whatsoever. +by rcu_read_lock() and rcu_read_unlock(), which, in +non-CONFIG_PREEMPTION kernels, generate no code whatsoever. This means that RCU writers are unaware of the presence of concurrent readers, so that RCU updates to shared data must be undertaken quite @@ -303,10 +303,10 @@ Answer: This cannot happen. The reason i to smp_call_function() and further to smp_call_function_on_cpu(), causing this latter to spin until the cross-CPU invocation of rcu_barrier_func() has completed. This by itself would prevent - a grace period from completing on non-CONFIG_PREEMPT kernels, + a grace period from completing on non-CONFIG_PREEMPTION kernels, since each CPU must undergo a context switch (or other quiescent state) before the grace period can complete. However, this is - of no use in CONFIG_PREEMPT kernels. + of no use in CONFIG_PREEMPTION kernels.
Therefore, on_each_cpu() disables preemption across its call to smp_call_function() and also across the local call to Index: linux-5.4.290-rt95/Documentation/RCU/stallwarn.txt =================================================================== --- linux-5.4.290-rt95.orig/Documentation/RCU/stallwarn.txt +++ linux-5.4.290-rt95/Documentation/RCU/stallwarn.txt @@ -20,7 +20,7 @@ o A CPU looping with preemption disabled o A CPU looping with bottom halves disabled. -o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel +o For !CONFIG_PREEMPTION kernels, a CPU looping anywhere in the kernel without invoking schedule(). If the looping in the kernel is really expected and desirable behavior, you might need to add some calls to cond_resched(). @@ -39,7 +39,7 @@ o Anything that prevents RCU's grace-per result in the "rcu_.*kthread starved for" console-log message, which will include additional debugging information. -o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might +o A CPU-bound real-time task in a CONFIG_PREEMPTION kernel, which might happen to preempt a low-priority task in the middle of an RCU read-side critical section. This is especially damaging if that low-priority task is not permitted to run on any other CPU, Index: linux-5.4.290-rt95/Documentation/RCU/whatisRCU.txt =================================================================== --- linux-5.4.290-rt95.orig/Documentation/RCU/whatisRCU.txt +++ linux-5.4.290-rt95/Documentation/RCU/whatisRCU.txt @@ -648,9 +648,10 @@ Quick Quiz #1: Why is this argument naiv This section presents a "toy" RCU implementation that is based on "classic RCU". It is also short on performance (but only for updates) and -on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT -kernels. The definitions of rcu_dereference() and rcu_assign_pointer() -are the same as those shown in the preceding section, so they are omitted. +on features such as hotplug CPU and the ability to run in +CONFIG_PREEMPTION kernels. The definitions of rcu_dereference() and +rcu_assign_pointer() are the same as those shown in the preceding +section, so they are omitted. void rcu_read_lock(void) { } Index: linux-5.4.290-rt95/Documentation/admin-guide/sysctl/vm.rst =================================================================== --- linux-5.4.290-rt95.orig/Documentation/admin-guide/sysctl/vm.rst +++ linux-5.4.290-rt95/Documentation/admin-guide/sysctl/vm.rst @@ -129,6 +129,9 @@ allowed to examine the unevictable lru ( This should be used on systems where stalls for minor page faults are an acceptable trade for large contiguous free memory. Set to 0 to prevent compaction from moving pages that are unevictable. Default value is 1. +On CONFIG_PREEMPT_RT the default value is 0 in order to avoid a page fault, due +to compaction, which would block the task from becoming active until the fault +is resolved. dirty_background_bytes Index: linux-5.4.290-rt95/Documentation/printk-ringbuffer.txt =================================================================== --- /dev/null +++ linux-5.4.290-rt95/Documentation/printk-ringbuffer.txt @@ -0,0 +1,377 @@ +struct printk_ringbuffer +------------------------ +John Ogness + +Overview +~~~~~~~~ +As the name suggests, this ring buffer was implemented specifically to serve +the needs of the printk() infrastructure. The ring buffer itself is not +specific to printk and could be used for other purposes. _However_, the +requirements and semantics of printk are rather unique.
If you intend to use +this ring buffer for anything other than printk, you need to be very clear on +its features, behavior, and pitfalls. + +Features +^^^^^^^^ +The printk ring buffer has the following features: + +- single global buffer +- resides in initialized data section (available at early boot) +- lockless readers +- supports multiple writers +- supports multiple non-consuming readers +- safe from any context (including NMI) +- groups bytes into variable length blocks (referenced by entries) +- entries tagged with sequence numbers + +Behavior +^^^^^^^^ +Since the printk ring buffer readers are lockless, there exists no +synchronization between readers and writers. Basically writers are the tasks +in control and may overwrite any and all committed data at any time and from +any context. For this reason readers can miss entries if they are overwritten +before the reader was able to access the data. The reader API implementation +is such that reader access to entries is atomic, so there is no risk of +readers having to deal with partial or corrupt data. Also, entries are +tagged with sequence numbers so readers can recognize if entries were missed. + +Writing to the ring buffer consists of 2 steps. First a writer must reserve +an entry of desired size. After this step the writer has exclusive access +to the memory region. Once the data has been written to memory, it needs to +be committed to the ring buffer. After this step the entry has been inserted +into the ring buffer and assigned an appropriate sequence number. + +Once committed, a writer must no longer access the data directly. This is +because the data may have been overwritten and no longer exists. If a +writer must access the data, it should either keep a private copy before +committing the entry or use the reader API to gain access to the data. + +Because of how the data backend is implemented, entries that have been +reserved but not yet committed act as barriers, preventing future writers +from filling the ring buffer beyond the location of the reserved but not +yet committed entry region. For this reason it is *important* that writers +perform both reserve and commit as quickly as possible. Also, be aware that +preemption and local interrupts are disabled and writing to the ring buffer +is processor-reentrant locked during the reserve/commit window. Writers in +NMI contexts can still preempt any other writers, but as long as these +writers do not write a large amount of data with respect to the ring buffer +size, this should not become an issue. + +API +~~~ + +Declaration +^^^^^^^^^^^ +The printk ring buffer can be instantiated as a static structure: + + /* declare a static struct printk_ringbuffer */ + #define DECLARE_STATIC_PRINTKRB(name, szbits, cpulockptr) + +The value of szbits specifies the size of the ring buffer in bits. The +cpulockptr field is a pointer to a prb_cpulock struct that is used to +perform processor-reentrant spin locking for the writers. It is specified +externally because it may be used for multiple ring buffers (or other +code) to synchronize writers without risk of deadlock. + +Here is an example of a declaration of a printk ring buffer specifying a +32KB (2^15) ring buffer: + +.... +DECLARE_STATIC_PRINTKRB_CPULOCK(rb_cpulock); +DECLARE_STATIC_PRINTKRB(rb, 15, &rb_cpulock); +.... + +If writers will be using multiple ring buffers and the ordering of that usage +is not clear, the same prb_cpulock should be used for both ring buffers. 
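+
+As a minimal illustrative sketch of that note (the buffer names rb_a and
+rb_b and their sizes are made up for this example and are not part of the
+actual patch), two ring buffers sharing a single prb_cpulock would be
+declared as:
+
+....
+/* one shared cpulock: writers to either ring buffer synchronize through
+ * it, so mixed usage of the two buffers cannot deadlock */
+DECLARE_STATIC_PRINTKRB_CPULOCK(shared_cpulock);
+
+DECLARE_STATIC_PRINTKRB(rb_a, 15, &shared_cpulock); /* 32KB (2^15) */
+DECLARE_STATIC_PRINTKRB(rb_b, 12, &shared_cpulock); /* 4KB (2^12) */
+....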
+ +Writer API +^^^^^^^^^^ +The writer API consists of 2 functions. The first is to reserve an entry in +the ring buffer, the second is to commit that data to the ring buffer. The +reserved entry information is stored within a provided `struct prb_handle`. + + /* reserve an entry */ + char *prb_reserve(struct prb_handle *h, struct printk_ringbuffer *rb, + unsigned int size); + + /* commit a reserved entry to the ring buffer */ + void prb_commit(struct prb_handle *h); + +Here is an example of a function to write data to a ring buffer: + +.... +int write_data(struct printk_ringbuffer *rb, char *data, int size) +{ + struct prb_handle h; + char *buf; + + buf = prb_reserve(&h, rb, size); + if (!buf) + return -1; + memcpy(buf, data, size); + prb_commit(&h); + + return 0; +} +.... + +Pitfalls +++++++++ +Be aware that prb_reserve() can fail. A retry might be successful, but it +depends entirely on whether or not the next part of the ring buffer to +overwrite belongs to reserved but not yet committed entries of other writers. +Writers can use the prb_inc_lost() function to allow readers to notice that a +message was lost. + +Reader API +^^^^^^^^^^ +The reader API utilizes a `struct prb_iterator` to track the reader's +position in the ring buffer. + + /* declare a pre-initialized static iterator for a ring buffer */ + #define DECLARE_STATIC_PRINTKRB_ITER(name, rbaddr) + + /* initialize iterator for a ring buffer (if static macro NOT used) */ + void prb_iter_init(struct prb_iterator *iter, + struct printk_ringbuffer *rb, u64 *seq); + + /* make a deep copy of an iterator */ + void prb_iter_copy(struct prb_iterator *dest, + struct prb_iterator *src); + + /* non-blocking, advance to next entry (and read the data) */ + int prb_iter_next(struct prb_iterator *iter, char *buf, + int size, u64 *seq); + + /* blocking, advance to next entry (and read the data) */ + int prb_iter_wait_next(struct prb_iterator *iter, char *buf, + int size, u64 *seq); + + /* position iterator at the entry seq */ + int prb_iter_seek(struct prb_iterator *iter, u64 seq); + + /* read data at current position */ + int prb_iter_data(struct prb_iterator *iter, char *buf, + int size, u64 *seq); + +Typically prb_iter_data() is not needed because the data can be retrieved +directly with prb_iter_next(). + +Here is an example of a non-blocking function that will read all the data in +a ring buffer: + +.... +void read_all_data(struct printk_ringbuffer *rb, char *buf, int size) +{ + struct prb_iterator iter; + u64 prev_seq = 0; + u64 seq; + int ret; + + prb_iter_init(&iter, rb, NULL); + + for (;;) { + ret = prb_iter_next(&iter, buf, size, &seq); + if (ret > 0) { + if (seq != ++prev_seq) { + /* "seq - prev_seq" entries missed */ + prev_seq = seq; + } + /* process buf here */ + } else if (ret == 0) { + /* hit the end, done */ + break; + } else if (ret < 0) { + /* + * iterator is invalid, a writer overtook us, reset the + * iterator and keep going, entries were missed + */ + prb_iter_init(&iter, rb, NULL); + } + } +} +.... + +Pitfalls +++++++++ +The reader's iterator can become invalid at any time because the reader was +overtaken by a writer. Typically the reader should reset the iterator back +to the current oldest entry (which will be newer than the entry the reader +was at) and continue, noting the number of entries that were missed. + +Utility API +^^^^^^^^^^^ +Several functions are available as convenience for external code. 
+ + /* query the size of the data buffer */ + int prb_buffer_size(struct printk_ringbuffer *rb); + + /* skip a seq number to signify a lost record */ + void prb_inc_lost(struct printk_ringbuffer *rb); + + /* processor-reentrant spin lock */ + void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store); + + /* processor-reentrant spin unlock */ + void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store); + +Pitfalls +++++++++ +Although the value returned by prb_buffer_size() does represent an absolute +upper bound, the amount of data that can be stored within the ring buffer +is actually less because of the additional storage space of a header for each +entry. + +The prb_lock() and prb_unlock() functions can be used to synchronize between +ring buffer writers and other external activities. The function of a +processor-reentrant spin lock is to disable preemption and local interrupts +and synchronize against other processors. It does *not* protect against +multiple contexts of a single processor, i.e. NMI. + +Implementation +~~~~~~~~~~~~~~ +This section describes several of the implementation concepts and details to +help developers better understand the code. + +Entries +^^^^^^^ +All ring buffer data is stored within a single static byte array. The reason +for this is to ensure that any pointers to the data (past and present) will +always point to valid memory. This is important because the lockless readers +may be preempted for long periods of time and when they resume may be working +with expired pointers. + +Entries are identified by start index and size. (The start index plus size +is the start index of the next entry.) The start index is not simply an +offset into the byte array, but rather a logical position (lpos) that maps +directly to byte array offsets. + +For example, for a byte array of size 1000, an entry may have a start index +of 100. Another entry may have a start index of 1100. And yet another, 2100. +All of these entries point to the same memory region, but only the most +recent entry is valid. The other entries point to valid memory, but +represent entries that have been overwritten. + +Note that due to overflowing, the most recent entry is not necessarily the one +with the highest lpos value. Indeed, the printk ring buffer initializes its +data such that an overflow happens relatively quickly in order to validate the +handling of this situation. The implementation assumes that an lpos (unsigned +long) will never completely wrap while a reader is preempted. If this were to +become an issue, the seq number (which never wraps) could be used to increase +the robustness of handling this situation. + +Buffer Wrapping +^^^^^^^^^^^^^^^ +If an entry starts near the end of the byte array but would extend beyond it, +a special terminating entry (size = -1) is inserted into the byte array and +the real entry is placed at the beginning of the byte array. This can waste +space at the end of the byte array, but simplifies the implementation by +allowing writers to always work with contiguous buffers. + +Note that the size field is the first 4 bytes of the entry header. Also note +that calc_next() always ensures that there are at least 4 bytes left at the +end of the byte array to allow room for a terminating entry.
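+
+To make the lpos arithmetic and the terminating entry concrete, here is a
+purely hypothetical sketch (the names next_entry_lpos and TERM_SIZE are
+made up for illustration; they are not the helpers used by the actual
+implementation):
+
+....
+#define TERM_SIZE ((u32)-1) /* size = -1 marks a terminating entry */
+
+/* step from one entry's start lpos to the start lpos of the next entry */
+static unsigned long next_entry_lpos(unsigned long lpos, char *data,
+				     unsigned int datasz)
+{
+	unsigned int off = lpos % datasz; /* lpos maps to an array offset */
+	u32 size = *(u32 *)&data[off];    /* size: first 4 bytes of header */
+
+	if (size == TERM_SIZE)            /* wrap: next entry is at offset 0 */
+		return lpos + (datasz - off);
+	return lpos + size;               /* start index + size = next start */
+}
+....
+
+A reader must still verify that such an lpos satisfies `>= tail && < head`
+before trusting any data it read, as described under Ring Buffer Pointers
+below.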
+ +Ring Buffer Pointers +^^^^^^^^^^^^^^^^^^^^ +Three pointers (lpos values) are used to manage the ring buffer: + + - _tail_: points to the oldest entry + - _head_: points to where the next new committed entry will be + - _reserve_: points to where the next new reserved entry will be + +These pointers always maintain a logical ordering: + + tail <= head <= reserve + +The reserve pointer moves forward when a writer reserves a new entry. The +head pointer moves forward when a writer commits a new entry. + +The reserve pointer cannot overwrite the tail pointer in a wrap situation. In +such a situation, the tail pointer must be "pushed forward", thus +invalidating that oldest entry. Readers identify if they are accessing a +valid entry by ensuring their entry pointer is `>= tail && < head`. + +If the tail pointer is equal to the head pointer, it cannot be pushed and any +reserve operation will fail. The only resolution is for writers to commit +their reserved entries. + +Processor-Reentrant Locking +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The purpose of the processor-reentrant locking is to limit the interruption +scenarios of writers to 2 contexts. This allows for a simplified +implementation where: + +- The reserve/commit window only exists on 1 processor at a time. A reserve + can never fail due to uncommitted entries of other processors. + +- When committing entries, it is trivial to handle the situation when + subsequent entries have already been committed, i.e. managing the head + pointer. + +Performance +~~~~~~~~~~~ +Some basic tests were performed on a quad Intel(R) Xeon(R) CPU E5-2697 v4 at +2.30GHz (36 cores / 72 threads). All tests involved writing a total of +32,000,000 records at an average of 33 bytes each. Each writer was pinned to +its own CPU and would write as fast as it could until a total of 32,000,000 +records were written. All tests involved 2 readers that were both pinned +together to another CPU. Each reader would read as fast as it could and track +how many of the 32,000,000 records it could read. All tests used a ring buffer +of 16KB in size, which holds around 350 records (header + data for each +entry). + +The only difference between the tests is the number of writers (and thus also +the number of records per writer). As more writers are added, the time to +write a record increases. This is because data pointers, modified via cmpxchg, +and global data access in general become more contended. + +1 writer +^^^^^^^^ + runtime: 0m 18s + reader1: 16219900/32000000 (50%) records + reader2: 16141582/32000000 (50%) records + +2 writers +^^^^^^^^^ + runtime: 0m 32s + reader1: 16327957/32000000 (51%) records + reader2: 16313988/32000000 (50%) records + +4 writers +^^^^^^^^^ + runtime: 0m 42s + reader1: 16421642/32000000 (51%) records + reader2: 16417224/32000000 (51%) records + +8 writers +^^^^^^^^^ + runtime: 0m 43s + reader1: 16418300/32000000 (51%) records + reader2: 16432222/32000000 (51%) records + +16 writers +^^^^^^^^^^ + runtime: 0m 54s + reader1: 16539189/32000000 (51%) records + reader2: 16542711/32000000 (51%) records + +32 writers +^^^^^^^^^^ + runtime: 1m 13s + reader1: 16731808/32000000 (52%) records + reader2: 16735119/32000000 (52%) records + +Comments +^^^^^^^^ +It is particularly interesting to compare/contrast the 1-writer and 32-writer +tests. Despite the writing of the 32,000,000 records taking over 4 times +longer, the readers (which perform no cmpxchg) were still unable to keep up. 
+This shows that the memory contention between the increasing number of CPUs +also has a dramatic effect on readers. + +It should also be noted that in all cases each reader was able to read >=50% +of the records. This means that a single reader would have been able to keep +up with the writer(s) in all cases, becoming slightly easier as more writers +are added. This was the purpose of pinning 2 readers to 1 CPU: to observe how +maximum reader performance changes. Index: linux-5.4.290-rt95/Documentation/trace/ftrace-uses.rst =================================================================== --- linux-5.4.290-rt95.orig/Documentation/trace/ftrace-uses.rst +++ linux-5.4.290-rt95/Documentation/trace/ftrace-uses.rst @@ -146,7 +146,7 @@ FTRACE_OPS_FL_RECURSION_SAFE itself or any nested functions that those functions call. If this flag is set, it is possible that the callback will also - be called with preemption enabled (when CONFIG_PREEMPT is set), + be called with preemption enabled (when CONFIG_PREEMPTION is set), but this is not guaranteed. FTRACE_OPS_FL_IPMODIFY Index: linux-5.4.290-rt95/arch/Kconfig =================================================================== --- linux-5.4.290-rt95.orig/arch/Kconfig +++ linux-5.4.290-rt95/arch/Kconfig @@ -31,6 +31,7 @@ config OPROFILE tristate "OProfile system profiling" depends on PROFILING depends on HAVE_OPROFILE + depends on !PREEMPT_RT select RING_BUFFER select RING_BUFFER_ALLOW_SWAP help Index: linux-5.4.290-rt95/arch/alpha/include/asm/spinlock_types.h =================================================================== --- linux-5.4.290-rt95.orig/arch/alpha/include/asm/spinlock_types.h +++ linux-5.4.290-rt95/arch/alpha/include/asm/spinlock_types.h @@ -2,10 +2,6 @@ #ifndef _ALPHA_SPINLOCK_TYPES_H #define _ALPHA_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - typedef struct { volatile unsigned int lock; } arch_spinlock_t; Index: linux-5.4.290-rt95/arch/arc/kernel/entry.S =================================================================== --- linux-5.4.290-rt95.orig/arch/arc/kernel/entry.S +++ linux-5.4.290-rt95/arch/arc/kernel/entry.S @@ -344,11 +344,11 @@ resume_user_mode_begin: resume_kernel_mode: ; Disable Interrupts from this point on - ; CONFIG_PREEMPT: This is a must for preempt_schedule_irq() - ; !CONFIG_PREEMPT: To ensure restore_regs is intr safe + ; CONFIG_PREEMPTION: This is a must for preempt_schedule_irq() + ; !CONFIG_PREEMPTION: To ensure restore_regs is intr safe IRQ_DISABLE r9 -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION ; Can't preempt if preemption disabled GET_CURR_THR_INFO_FROM_SP r10 Index: linux-5.4.290-rt95/arch/arm/Kconfig =================================================================== --- linux-5.4.290-rt95.orig/arch/arm/Kconfig +++ linux-5.4.290-rt95/arch/arm/Kconfig @@ -33,6 +33,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_RT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU @@ -65,7 +66,7 @@ config ARM select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 - select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU + select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && 
MMU select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT @@ -105,6 +106,7 @@ config ARM select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select HAVE_RCU_TABLE_FREE if SMP && ARM_LPAE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ Index: linux-5.4.290-rt95/arch/arm/include/asm/irq.h =================================================================== --- linux-5.4.290-rt95.orig/arch/arm/include/asm/irq.h +++ linux-5.4.290-rt95/arch/arm/include/asm/irq.h @@ -23,6 +23,8 @@ #endif #ifndef __ASSEMBLY__ +#include + struct irqaction; struct pt_regs; Index: linux-5.4.290-rt95/arch/arm/include/asm/spinlock_types.h =================================================================== --- linux-5.4.290-rt95.orig/arch/arm/include/asm/spinlock_types.h +++ linux-5.4.290-rt95/arch/arm/include/asm/spinlock_types.h @@ -2,10 +2,6 @@ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - #define TICKET_SHIFT 16 typedef struct { Index: linux-5.4.290-rt95/arch/arm/include/asm/switch_to.h =================================================================== --- linux-5.4.290-rt95.orig/arch/arm/include/asm/switch_to.h +++ linux-5.4.290-rt95/arch/arm/include/asm/switch_to.h @@ -4,13 +4,20 @@ #include +#if defined CONFIG_PREEMPT_RT && defined CONFIG_HIGHMEM +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p); +#else +static inline void +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { } +#endif + /* * For v7 SMP cores running a preemptible kernel we may be pre-empted * during a TLB maintenance operation, so execute an inner-shareable dsb * to ensure that the maintenance completes in case we migrate to another * CPU. 
*/ -#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) && defined(CONFIG_CPU_V7) +#if defined(CONFIG_PREEMPTION) && defined(CONFIG_SMP) && defined(CONFIG_CPU_V7) #define __complete_pending_tlbi() dsb(ish) #else #define __complete_pending_tlbi() @@ -26,6 +33,7 @@ extern struct task_struct *__switch_to(s #define switch_to(prev,next,last) \ do { \ __complete_pending_tlbi(); \ + switch_kmaps(prev, next); \ last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \ } while (0) Index: linux-5.4.290-rt95/arch/arm/include/asm/thread_info.h =================================================================== --- linux-5.4.290-rt95.orig/arch/arm/include/asm/thread_info.h +++ linux-5.4.290-rt95/arch/arm/include/asm/thread_info.h @@ -46,6 +46,7 @@ struct cpu_context_save { struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ __u32 cpu; /* cpu */ @@ -139,7 +140,8 @@ extern int vfp_restore_user_hwstate(stru #define TIF_SYSCALL_TRACE 4 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */ #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ -#define TIF_SECCOMP 7 /* seccomp syscall filtering active */ +#define TIF_SECCOMP 8 /* seccomp syscall filtering active */ +#define TIF_NEED_RESCHED_LAZY 7 #define TIF_NOHZ 12 /* in adaptive nohz mode */ #define TIF_USING_IWMMXT 17 @@ -149,6 +151,7 @@ extern int vfp_restore_user_hwstate(stru #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) @@ -164,7 +167,8 @@ extern int vfp_restore_user_hwstate(stru * Change these and you break ASM code in entry-common.S */ #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_UPROBE) + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NEED_RESCHED_LAZY) #endif /* __KERNEL__ */ #endif /* __ASM_ARM_THREAD_INFO_H */ Index: linux-5.4.290-rt95/arch/arm/kernel/asm-offsets.c =================================================================== --- linux-5.4.290-rt95.orig/arch/arm/kernel/asm-offsets.c +++ linux-5.4.290-rt95/arch/arm/kernel/asm-offsets.c @@ -55,6 +55,7 @@ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); Index: linux-5.4.290-rt95/arch/arm/kernel/entry-armv.S =================================================================== --- linux-5.4.290-rt95.orig/arch/arm/kernel/entry-armv.S +++ linux-5.4.290-rt95/arch/arm/kernel/entry-armv.S @@ -204,13 +204,20 @@ __irq_svc: svc_entry irq_handler -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION ldr r8, [tsk, #TI_PREEMPT] @ get preempt count - ldr r0, [tsk, #TI_FLAGS] @ get flags teq r8, #0 @ if preempt count != 0 + bne 1f @ return from exception + ldr r0, [tsk, #TI_FLAGS] @ get flags + tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set + blne
svc_preempt @ preempt! + + ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count + teq r8, #0 @ if preempt lazy count != 0 movne r0, #0 @ force flags to 0 - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED_LAZY blne svc_preempt +1: #endif svc_exit r5, irq = 1 @ return from exception @@ -219,14 +226,20 @@ ENDPROC(__irq_svc) .ltorg -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION svc_preempt: mov r8, lr 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS tst r0, #_TIF_NEED_RESCHED + bne 1b + tst r0, #_TIF_NEED_RESCHED_LAZY reteq r8 @ go again - b 1b + ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count + teq r0, #0 @ if preempt lazy count != 0 + beq 1b + ret r8 @ go again + #endif __und_fault: Index: linux-5.4.290-rt95/arch/arm/kernel/entry-common.S =================================================================== --- linux-5.4.290-rt95.orig/arch/arm/kernel/entry-common.S +++ linux-5.4.290-rt95/arch/arm/kernel/entry-common.S @@ -53,7 +53,9 @@ __ret_fast_syscall: cmp r2, #TASK_SIZE blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK + tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP) + bne fast_work_pending + tst r1, #_TIF_SECCOMP bne fast_work_pending @@ -90,8 +92,11 @@ __ret_fast_syscall: cmp r2, #TASK_SIZE blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK + tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP) + bne do_slower_path + tst r1, #_TIF_SECCOMP beq no_work_pending +do_slower_path: UNWIND(.fnend ) ENDPROC(ret_fast_syscall) Index: linux-5.4.290-rt95/arch/arm/kernel/signal.c =================================================================== --- linux-5.4.290-rt95.orig/arch/arm/kernel/signal.c +++ linux-5.4.290-rt95/arch/arm/kernel/signal.c @@ -649,7 +649,8 @@ do_work_pending(struct pt_regs *regs, un */ trace_hardirqs_off(); do { - if (likely(thread_flags & _TIF_NEED_RESCHED)) { + if (likely(thread_flags & (_TIF_NEED_RESCHED | + _TIF_NEED_RESCHED_LAZY))) { schedule(); } else { if (unlikely(!user_mode(regs))) Index: linux-5.4.290-rt95/arch/arm/kernel/smp.c =================================================================== --- linux-5.4.290-rt95.orig/arch/arm/kernel/smp.c +++ linux-5.4.290-rt95/arch/arm/kernel/smp.c @@ -682,11 +682,9 @@ void handle_IPI(int ipinr, struct pt_reg break; case IPI_CPU_BACKTRACE: - printk_nmi_enter(); irq_enter(); nmi_cpu_backtrace(regs); irq_exit(); - printk_nmi_exit(); break; default: Index: linux-5.4.290-rt95/arch/arm/kernel/traps.c =================================================================== --- linux-5.4.290-rt95.orig/arch/arm/kernel/traps.c +++ linux-5.4.290-rt95/arch/arm/kernel/traps.c @@ -251,6 +251,8 @@ void show_stack(struct task_struct *tsk, #ifdef CONFIG_PREEMPT #define S_PREEMPT " PREEMPT" +#elif defined(CONFIG_PREEMPT_RT) +#define S_PREEMPT " PREEMPT_RT" #else #define S_PREEMPT "" #endif Index: linux-5.4.290-rt95/arch/arm/mm/cache-v7.S =================================================================== --- linux-5.4.290-rt95.orig/arch/arm/mm/cache-v7.S +++ linux-5.4.290-rt95/arch/arm/mm/cache-v7.S @@ -135,13 +135,13 @@ flush_levels: and r1, r1, #7 @ mask of the bits for current cache only cmp r1, #2 @ see what cache we have at this level blt skip @ skip if no cache, or just i-cache -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION save_and_disable_irqs_notrace r9 @ make cssr&csidr 
read atomic #endif mcr p15, 2, r10, c0, c0, 0 @ select current cache level in cssr isb @ isb to sych the new cssr&csidr mrc p15, 1, r1, c0, c0, 0 @ read the new csidr -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION restore_irqs_notrace r9 #endif and r2, r1, #7 @ extract the length of the cache lines Index: linux-5.4.290-rt95/arch/arm/mm/cache-v7m.S =================================================================== --- linux-5.4.290-rt95.orig/arch/arm/mm/cache-v7m.S +++ linux-5.4.290-rt95/arch/arm/mm/cache-v7m.S @@ -183,13 +183,13 @@ flush_levels: and r1, r1, #7 @ mask of the bits for current cache only cmp r1, #2 @ see what cache we have at this level blt skip @ skip if no cache, or just i-cache -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION save_and_disable_irqs_notrace r9 @ make cssr&csidr read atomic #endif write_csselr r10, r1 @ set current cache level isb @ isb to sych the new cssr&csidr read_ccsidr r1 @ read the new csidr -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION restore_irqs_notrace r9 #endif and r2, r1, #7 @ extract the length of the cache lines Index: linux-5.4.290-rt95/arch/arm/mm/fault.c =================================================================== --- linux-5.4.290-rt95.orig/arch/arm/mm/fault.c +++ linux-5.4.290-rt95/arch/arm/mm/fault.c @@ -414,6 +414,9 @@ do_translation_fault(unsigned long addr, if (addr < TASK_SIZE) return do_page_fault(addr, fsr, regs); + if (interrupts_enabled(regs)) + local_irq_enable(); + if (user_mode(regs)) goto bad_area; @@ -481,6 +484,9 @@ do_translation_fault(unsigned long addr, static int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { + if (interrupts_enabled(regs)) + local_irq_enable(); + do_bad_area(addr, fsr, regs); return 0; } Index: linux-5.4.290-rt95/arch/arm/mm/highmem.c =================================================================== --- linux-5.4.290-rt95.orig/arch/arm/mm/highmem.c +++ linux-5.4.290-rt95/arch/arm/mm/highmem.c @@ -31,6 +31,11 @@ static inline pte_t get_fixmap_pte(unsig return *ptep; } +static unsigned int fixmap_idx(int type) +{ + return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); +} + void *kmap(struct page *page) { might_sleep(); @@ -51,12 +56,13 @@ EXPORT_SYMBOL(kunmap); void *kmap_atomic(struct page *page) { + pte_t pte = mk_pte(page, kmap_prot); unsigned int idx; unsigned long vaddr; void *kmap; int type; - preempt_disable(); + preempt_disable_nort(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -76,7 +82,7 @@ void *kmap_atomic(struct page *page) type = kmap_atomic_idx_push(); - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); + idx = fixmap_idx(type); vaddr = __fix_to_virt(idx); #ifdef CONFIG_DEBUG_HIGHMEM /* @@ -90,7 +96,10 @@ void *kmap_atomic(struct page *page) * in place, so the contained TLB flush ensures the TLB is updated * with the new mapping. 
*/ - set_fixmap_pte(idx, mk_pte(page, kmap_prot)); +#ifdef CONFIG_PREEMPT_RT + current->kmap_pte[type] = pte; +#endif + set_fixmap_pte(idx, pte); return (void *)vaddr; } @@ -103,44 +112,75 @@ void __kunmap_atomic(void *kvaddr) if (kvaddr >= (void *)FIXADDR_START) { type = kmap_atomic_idx(); - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); + idx = fixmap_idx(type); if (cache_is_vivt()) __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); +#ifdef CONFIG_PREEMPT_RT + current->kmap_pte[type] = __pte(0); +#endif #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(vaddr != __fix_to_virt(idx)); - set_fixmap_pte(idx, __pte(0)); #else (void) idx; /* to kill a warning */ #endif + set_fixmap_pte(idx, __pte(0)); kmap_atomic_idx_pop(); } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { /* this address was obtained through kmap_high_get() */ kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); } pagefault_enable(); - preempt_enable(); + preempt_enable_nort(); } EXPORT_SYMBOL(__kunmap_atomic); void *kmap_atomic_pfn(unsigned long pfn) { + pte_t pte = pfn_pte(pfn, kmap_prot); unsigned long vaddr; int idx, type; struct page *page = pfn_to_page(pfn); - preempt_disable(); + preempt_disable_nort(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); type = kmap_atomic_idx_push(); - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); + idx = fixmap_idx(type); vaddr = __fix_to_virt(idx); #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(get_fixmap_pte(vaddr))); #endif - set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot)); +#ifdef CONFIG_PREEMPT_RT + current->kmap_pte[type] = pte; +#endif + set_fixmap_pte(idx, pte); return (void *)vaddr; } +#if defined CONFIG_PREEMPT_RT +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) +{ + int i; + + /* + * Clear @prev's kmap_atomic mappings + */ + for (i = 0; i < prev_p->kmap_idx; i++) { + int idx = fixmap_idx(i); + + set_fixmap_pte(idx, __pte(0)); + } + /* + * Restore @next_p's kmap_atomic mappings + */ + for (i = 0; i < next_p->kmap_idx; i++) { + int idx = fixmap_idx(i); + + if (!pte_none(next_p->kmap_pte[i])) + set_fixmap_pte(idx, next_p->kmap_pte[i]); + } +} +#endif Index: linux-5.4.290-rt95/arch/arm64/Kconfig =================================================================== --- linux-5.4.290-rt95.orig/arch/arm64/Kconfig +++ linux-5.4.290-rt95/arch/arm64/Kconfig @@ -35,32 +35,32 @@ config ARM64 select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_NMI_SAFE_CMPXCHG - select ARCH_INLINE_READ_LOCK if !PREEMPT - select ARCH_INLINE_READ_LOCK_BH if !PREEMPT - select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPT - select ARCH_INLINE_READ_LOCK_IRQSAVE if !PREEMPT - select ARCH_INLINE_READ_UNLOCK if !PREEMPT - select ARCH_INLINE_READ_UNLOCK_BH if !PREEMPT - select ARCH_INLINE_READ_UNLOCK_IRQ if !PREEMPT - select ARCH_INLINE_READ_UNLOCK_IRQRESTORE if !PREEMPT - select ARCH_INLINE_WRITE_LOCK if !PREEMPT - select ARCH_INLINE_WRITE_LOCK_BH if !PREEMPT - select ARCH_INLINE_WRITE_LOCK_IRQ if !PREEMPT - select ARCH_INLINE_WRITE_LOCK_IRQSAVE if !PREEMPT - select ARCH_INLINE_WRITE_UNLOCK if !PREEMPT - select ARCH_INLINE_WRITE_UNLOCK_BH if !PREEMPT - select ARCH_INLINE_WRITE_UNLOCK_IRQ if !PREEMPT - select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE if !PREEMPT - select ARCH_INLINE_SPIN_TRYLOCK if !PREEMPT - select ARCH_INLINE_SPIN_TRYLOCK_BH if !PREEMPT - select ARCH_INLINE_SPIN_LOCK if !PREEMPT - select ARCH_INLINE_SPIN_LOCK_BH if !PREEMPT - select 
ARCH_INLINE_SPIN_LOCK_IRQ if !PREEMPT - select ARCH_INLINE_SPIN_LOCK_IRQSAVE if !PREEMPT - select ARCH_INLINE_SPIN_UNLOCK if !PREEMPT - select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPT - select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPT - select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPT + select ARCH_INLINE_READ_LOCK if !PREEMPTION + select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION + select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPTION + select ARCH_INLINE_READ_LOCK_IRQSAVE if !PREEMPTION + select ARCH_INLINE_READ_UNLOCK if !PREEMPTION + select ARCH_INLINE_READ_UNLOCK_BH if !PREEMPTION + select ARCH_INLINE_READ_UNLOCK_IRQ if !PREEMPTION + select ARCH_INLINE_READ_UNLOCK_IRQRESTORE if !PREEMPTION + select ARCH_INLINE_WRITE_LOCK if !PREEMPTION + select ARCH_INLINE_WRITE_LOCK_BH if !PREEMPTION + select ARCH_INLINE_WRITE_LOCK_IRQ if !PREEMPTION + select ARCH_INLINE_WRITE_LOCK_IRQSAVE if !PREEMPTION + select ARCH_INLINE_WRITE_UNLOCK if !PREEMPTION + select ARCH_INLINE_WRITE_UNLOCK_BH if !PREEMPTION + select ARCH_INLINE_WRITE_UNLOCK_IRQ if !PREEMPTION + select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE if !PREEMPTION + select ARCH_INLINE_SPIN_TRYLOCK if !PREEMPTION + select ARCH_INLINE_SPIN_TRYLOCK_BH if !PREEMPTION + select ARCH_INLINE_SPIN_LOCK if !PREEMPTION + select ARCH_INLINE_SPIN_LOCK_BH if !PREEMPTION + select ARCH_INLINE_SPIN_LOCK_IRQ if !PREEMPTION + select ARCH_INLINE_SPIN_LOCK_IRQSAVE if !PREEMPTION + select ARCH_INLINE_SPIN_UNLOCK if !PREEMPTION + select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPTION + select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION + select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION select ARCH_KEEP_MEMBLOCK select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_QUEUED_RWLOCKS @@ -69,6 +69,7 @@ config ARM64 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_RT select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS @@ -159,6 +160,7 @@ config ARM64 select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_RCU_TABLE_FREE Index: linux-5.4.290-rt95/arch/arm64/crypto/sha256-glue.c =================================================================== --- linux-5.4.290-rt95.orig/arch/arm64/crypto/sha256-glue.c +++ linux-5.4.290-rt95/arch/arm64/crypto/sha256-glue.c @@ -97,7 +97,7 @@ static int sha256_update_neon(struct sha * input when running on a preemptible kernel, but process the * data block by block instead. */ - if (IS_ENABLED(CONFIG_PREEMPT) && + if (IS_ENABLED(CONFIG_PREEMPTION) && chunk + sctx->count % SHA256_BLOCK_SIZE > SHA256_BLOCK_SIZE) chunk = SHA256_BLOCK_SIZE - sctx->count % SHA256_BLOCK_SIZE; Index: linux-5.4.290-rt95/arch/arm64/include/asm/assembler.h =================================================================== --- linux-5.4.290-rt95.orig/arch/arm64/include/asm/assembler.h +++ linux-5.4.290-rt95/arch/arm64/include/asm/assembler.h @@ -707,8 +707,8 @@ USER(\label, ic ivau, \tmp2) // invali * where