diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst index d2c4c27e1702..d83c9ab49427 100644 --- a/Documentation/dev-tools/kcov.rst +++ b/Documentation/dev-tools/kcov.rst @@ -50,6 +50,7 @@ The following program demonstrates coverage collection from within a test #include <sys/mman.h> #include <unistd.h> #include <fcntl.h> + #include <linux/types.h> #define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) #define KCOV_ENABLE _IO('c', 100) @@ -177,6 +178,8 @@ Comparison operands collection /* Read number of comparisons collected. */ n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED); for (i = 0; i < n; i++) { + uint64_t ip; + type = cover[i * KCOV_WORDS_PER_CMP + 1]; /* arg1 and arg2 - operands of the comparison. */ arg1 = cover[i * KCOV_WORDS_PER_CMP + 2]; @@ -251,6 +254,8 @@ selectively from different subsystems. .. code-block:: c + /* Same includes and defines as above. */ + struct kcov_remote_arg { __u32 trace_mode; __u32 area_size;
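The kcov hunks above adjust the documentation's example programs: the basic example gains #include <linux/types.h> (needed for the __u32/__u64 fields of struct kcov_remote_arg shown later), the comparison-collection loop gains the missing uint64_t ip; declaration, and the remote-coverage snippet notes that it reuses the earlier includes and defines. For reference, a condensed version of the documented KCOV_TRACE_PC flow (abbreviated from kcov.rst; error handling trimmed, untested) looks like this:

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <linux/types.h>

    #define KCOV_INIT_TRACE _IOR('c', 1, unsigned long)
    #define KCOV_ENABLE     _IO('c', 100)
    #define KCOV_DISABLE    _IO('c', 101)
    #define KCOV_TRACE_PC   0
    #define COVER_SIZE      (64 << 10)

    int main(void)
    {
        unsigned long *cover, n, i;
        int fd = open("/sys/kernel/debug/kcov", O_RDWR);

        if (fd == -1)
            return 1;
        /* Size the trace buffer (in unsigned longs), then map it. */
        if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
            return 1;
        cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
                     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (cover == MAP_FAILED)
            return 1;
        /* Trace PCs for this thread only. */
        if (ioctl(fd, KCOV_ENABLE, KCOV_TRACE_PC))
            return 1;
        __atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);

        read(-1, NULL, 0); /* the code under test */

        /* cover[0] holds the number of PCs recorded after it. */
        n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
        for (i = 0; i < n; i++)
            printf("0x%lx\n", cover[i + 1]);
        ioctl(fd, KCOV_DISABLE, 0);
        return 0;
    }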
diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h index 1d5716bc060b..2526fd3be5fd 100644 --- a/arch/alpha/include/asm/spinlock_types.h +++ b/arch/alpha/include/asm/spinlock_types.h @@ -2,7 +2,7 @@ #ifndef _ALPHA_SPINLOCK_TYPES_H #define _ALPHA_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4ebd512043be..5ac2009727bd 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -32,6 +32,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE + select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST @@ -68,7 +69,7 @@ config ARM select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 - select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU + select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL select HAVE_ARCH_MMAP_RND_BITS if MMU @@ -109,6 +110,7 @@ config ARM select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ @@ -124,6 +126,7 @@ config ARM select OLD_SIGSUSPEND3 select PCI_SYSCALL if PCI select PERF_USE_VMALLOC + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select RTC_LIB select SYS_SUPPORTS_APM_EMULATION select TRACE_IRQFLAGS_SUPPORT if !CPU_V7M diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h index 5976958647fe..0c14b36ef101 100644 --- a/arch/arm/include/asm/spinlock_types.h +++ b/arch/arm/include/asm/spinlock_types.h @@ -2,7 +2,7 @@ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 9a18da3e10cc..2fa63d96a4f0 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -52,6 +52,7 @@ struct cpu_context_save { struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ struct task_struct *task; /* main task structure */ __u32 cpu; /* cpu */ __u32 cpu_domain; /* cpu domain */ @@ -134,6 +135,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ #define TIF_SECCOMP 7 /* seccomp syscall filtering active */ #define TIF_NOTIFY_SIGNAL 8 /* signal notifications exist */ +#define TIF_NEED_RESCHED_LAZY 9 #define TIF_USING_IWMMXT 17 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ @@ -148,6 +150,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) /* Checks for any syscall work in entry-common.S */ @@ -157,7 +160,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, /* * Change these and you break ASM code in entry-common.S */ -#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ + _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ _TIF_NOTIFY_SIGNAL) diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index a646a3f6440f..beb09d74684f 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -43,6 +43,7 @@ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); DEFINE(TI_CPU_DOMAIN, offsetof(struct thread_info, cpu_domain)); diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 68261a83b7ad..fa7d110ce555 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -206,11 +206,18 @@ ENDPROC(__dabt_svc) #ifdef CONFIG_PREEMPTION ldr r8, [tsk, #TI_PREEMPT] @ get preempt count - ldr r0, [tsk, #TI_FLAGS] @ get flags teq r8, #0 @ if preempt count != 0 + bne 1f @ return from exception + ldr r0, [tsk, #TI_FLAGS] @ get flags + tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set + blne svc_preempt @ preempt!
+ + ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count + teq r8, #0 @ if preempt lazy count != 0 movne r0, #0 @ force flags to 0 - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED_LAZY blne svc_preempt +1: #endif svc_exit r5, irq = 1 @ return from exception @@ -225,8 +232,14 @@ ENDPROC(__irq_svc) 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS tst r0, #_TIF_NEED_RESCHED + bne 1b + tst r0, #_TIF_NEED_RESCHED_LAZY reteq r8 @ go again - b 1b + ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count + teq r0, #0 @ if preempt lazy count != 0 + beq 1b + ret r8 @ go again + #endif __und_fault: diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 539897ac2828..4655f04ccdcd 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -607,7 +607,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) */ trace_hardirqs_off(); do { - if (likely(thread_flags & _TIF_NEED_RESCHED)) { + if (likely(thread_flags & (_TIF_NEED_RESCHED | + _TIF_NEED_RESCHED_LAZY))) { schedule(); } else { if (unlikely(!user_mode(regs))) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 842427ff2b3c..b943e2df9540 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -667,9 +667,7 @@ static void do_handle_IPI(int ipinr) break; case IPI_CPU_BACKTRACE: - printk_deferred_enter(); nmi_cpu_backtrace(get_irq_regs()); - printk_deferred_exit(); break; default: diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index efa402025031..59487ee9fd61 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -400,6 +400,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, if (addr < TASK_SIZE) return do_page_fault(addr, fsr, regs); + if (interrupts_enabled(regs)) + local_irq_enable(); + if (user_mode(regs)) goto bad_area; @@ -470,6 +473,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, static int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { + if (interrupts_enabled(regs)) + local_irq_enable(); + do_bad_area(addr, fsr, regs); return 0; } diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 8b6f090e0364..784c90ba371e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -88,6 +88,7 @@ config ARM64 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT @@ -191,6 +192,7 @@ config ARM64 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_PREEMPT_LAZY select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUTEX_CMPXCHG if FUTEX select MMU_GATHER_RCU_TABLE_FREE @@ -212,6 +214,7 @@ config ARM64 select PCI_DOMAINS_GENERIC if PCI select PCI_ECAM if (ACPI && PCI) select PCI_SYSCALL if PCI + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select POWER_RESET select POWER_SUPPLY select SPARSE_IRQ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index ed57717cd004..63b39229890b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1001,7 +1001,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, */ static inline bool arch_faults_on_old_pte(void) { - WARN_ON(preemptible()); + WARN_ON(is_migratable()); return !cpu_has_hw_af(); } diff --git 
a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h index e83f0982b99c..2545c17281e1 100644 --- a/arch/arm64/include/asm/preempt.h +++ b/arch/arm64/include/asm/preempt.h @@ -70,13 +70,36 @@ static inline bool __preempt_count_dec_and_test(void) * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE * pair. */ - return !pc || !READ_ONCE(ti->preempt_count); + if (!pc || !READ_ONCE(ti->preempt_count)) + return true; +#ifdef CONFIG_PREEMPT_LAZY + if ((pc & ~PREEMPT_NEED_RESCHED)) + return false; + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else + return false; +#endif } static inline bool should_resched(int preempt_offset) { +#ifdef CONFIG_PREEMPT_LAZY + u64 pc = READ_ONCE(current_thread_info()->preempt_count); + if (pc == preempt_offset) + return true; + + if ((pc & ~PREEMPT_NEED_RESCHED) != preempt_offset) + return false; + + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else u64 pc = READ_ONCE(current_thread_info()->preempt_count); return pc == preempt_offset; +#endif } #ifdef CONFIG_PREEMPTION diff --git a/arch/arm64/include/asm/signal.h b/arch/arm64/include/asm/signal.h index ef449f5f4ba8..5e535c3e4926 100644 --- a/arch/arm64/include/asm/signal.h +++ b/arch/arm64/include/asm/signal.h @@ -22,4 +22,8 @@ static inline void __user *arch_untagged_si_addr(void __user *addr, } #define arch_untagged_si_addr arch_untagged_si_addr +#if defined(CONFIG_PREEMPT_RT) +#define ARCH_RT_DELAYS_SIGNAL_SEND +#endif + #endif diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h index 18782f0c4721..11ab1c077697 100644 --- a/arch/arm64/include/asm/spinlock_types.h +++ b/arch/arm64/include/asm/spinlock_types.h @@ -5,7 +5,7 @@ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) +#if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H) # error "please don't include this file directly" #endif diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 6623c99f0984..c55ccec33a5a 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -26,6 +26,7 @@ struct thread_info { #ifdef CONFIG_ARM64_SW_TTBR0_PAN u64 ttbr0; /* saved TTBR0_EL1 */ #endif + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ union { u64 preempt_count; /* 0 => preemptible, <0 => bug */ struct { @@ -67,6 +68,7 @@ int arch_dup_task_struct(struct task_struct *dst, #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ #define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ +#define TIF_NEED_RESCHED_LAZY 7 #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @@ -97,8 +99,10 @@ int arch_dup_task_struct(struct task_struct *dst, #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) -#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ + _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ 
_TIF_NOTIFY_SIGNAL) @@ -107,6 +111,8 @@ int arch_dup_task_struct(struct task_struct *dst, _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ _TIF_SYSCALL_EMU) +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) + #ifdef CONFIG_SHADOW_CALL_STACK #define INIT_SCS \ .scs_base = init_shadow_call_stack, \ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 551427ae8cc5..96a4f6c9eb78 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -31,6 +31,7 @@ int main(void) BLANK(); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); + DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count)); #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index ff4962750b3d..99484e8bbade 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -179,10 +179,19 @@ static void __get_cpu_fpsimd_context(void) * * The double-underscore version must only be called if you know the task * can't be preempted. + * + * On RT kernels local_bh_disable() is not sufficient because it only + * serializes soft interrupt related sections via a local lock, but stays + * preemptible. Disabling preemption is the right choice here as bottom + * half processing is always in thread context on RT kernels so it + * implicitly prevents bottom half processing as well. */ static void get_cpu_fpsimd_context(void) { - local_bh_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); __get_cpu_fpsimd_context(); } @@ -203,7 +212,10 @@ static void __put_cpu_fpsimd_context(void) static void put_cpu_fpsimd_context(void) { __put_cpu_fpsimd_context(); - local_bh_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); } static bool have_cpu_fpsimd_context(void) @@ -1033,6 +1045,7 @@ void fpsimd_thread_switch(struct task_struct *next) void fpsimd_flush_thread(void) { int vl, supported_vl; + void *sve_state = NULL; if (!system_supports_fpsimd()) return; @@ -1045,7 +1058,10 @@ void fpsimd_flush_thread(void) if (system_supports_sve()) { clear_thread_flag(TIF_SVE); - sve_free(current); + + /* Defer kfree() while in atomic context */ + sve_state = current->thread.sve_state; + current->thread.sve_state = NULL; /* * Reset the task vector length as required. 
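The comment added to get_cpu_fpsimd_context() above states a rule that recurs throughout this series: on PREEMPT_RT, local_bh_disable() only serializes softirq-related sections and the task stays preemptible, so code that must keep per-CPU state stable has to disable preemption explicitly; on non-RT kernels, local_bh_disable() already implies that. A minimal sketch of the resulting pattern (illustrative only; the helper names are made up):

    /*
     * Guard per-CPU state that softirq handlers also touch, on both
     * !RT and RT kernels. Hypothetical helpers, not a kernel API.
     */
    static inline void my_percpu_ctx_get(void)
    {
            if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                    local_bh_disable();     /* !RT: blocks softirqs and preemption */
            else
                    preempt_disable();      /* RT: softirqs run in task context,
                                             * so disabling preemption excludes
                                             * them as well */
    }

    static inline void my_percpu_ctx_put(void)
    {
            if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                    local_bh_enable();
            else
                    preempt_enable();
    }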
@@ -1079,6 +1095,7 @@ void fpsimd_flush_thread(void) } put_cpu_fpsimd_context(); + kfree(sve_state); } /* diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index b3e1beccf458..03183563feb8 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -922,7 +922,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { do { - if (thread_flags & _TIF_NEED_RESCHED) { + if (thread_flags & _TIF_NEED_RESCHED_MASK) { /* Unmask Debug and SError for the next task */ local_daif_restore(DAIF_PROCCTX_NOIRQ); @@ -930,6 +930,14 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) } else { local_daif_restore(DAIF_PROCCTX); +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (unlikely(current->forced_info.si_signo)) { + struct task_struct *t = current; + force_sig_info(&t->forced_info); + t->forced_info.si_signo = 0; + } +#endif + if (thread_flags & _TIF_UPROBE) uprobe_notify_resume(regs); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a30c036577a3..dbb269b8407b 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -829,7 +829,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * involves poking the GIC, which must be done in a * non-preemptible context. */ - preempt_disable(); + migrate_disable(); kvm_pmu_flush_hwstate(vcpu); @@ -853,7 +853,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_timer_sync_user(vcpu); kvm_vgic_sync_hwstate(vcpu); local_irq_enable(); - preempt_enable(); + migrate_enable(); continue; } @@ -922,7 +922,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* Exit types that need handling before we can be preempted */ handle_exit_early(vcpu, ret); - preempt_enable(); + migrate_enable(); /* * The ARMv8 architecture doesn't give the hypervisor diff --git a/arch/csky/include/asm/spinlock_types.h b/arch/csky/include/asm/spinlock_types.h index 8ff0f6ff3a00..db87a12c3827 100644 --- a/arch/csky/include/asm/spinlock_types.h +++ b/arch/csky/include/asm/spinlock_types.h @@ -3,7 +3,7 @@ #ifndef __ASM_CSKY_SPINLOCK_TYPES_H #define __ASM_CSKY_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif diff --git a/arch/hexagon/include/asm/spinlock_types.h b/arch/hexagon/include/asm/spinlock_types.h index 19d233497ba5..d5f66495b670 100644 --- a/arch/hexagon/include/asm/spinlock_types.h +++ b/arch/hexagon/include/asm/spinlock_types.h @@ -8,7 +8,7 @@ #ifndef _ASM_SPINLOCK_TYPES_H #define _ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif diff --git a/arch/ia64/include/asm/spinlock_types.h b/arch/ia64/include/asm/spinlock_types.h index 6e345fefcdca..14b8a161c165 100644 --- a/arch/ia64/include/asm/spinlock_types.h +++ b/arch/ia64/include/asm/spinlock_types.h @@ -2,7 +2,7 @@ #ifndef _ASM_IA64_SPINLOCK_TYPES_H #define _ASM_IA64_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 941618a7f7ff..a7bc2810edb6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -151,6 +151,7 @@ config PPC select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x + select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_USE_BUILTIN_BSWAP select 
ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_MEMTEST @@ -218,6 +219,7 @@ config PPC select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) select HAVE_IOREMAP_PROT select HAVE_IRQ_TIME_ACCOUNTING + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZMA if DEFAULT_UIMAGE select HAVE_KERNEL_LZO if DEFAULT_UIMAGE @@ -234,6 +236,7 @@ config PPC select HAVE_PERF_EVENTS_NMI if PPC64 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE select HAVE_RSEQ diff --git a/arch/powerpc/include/asm/simple_spinlock_types.h b/arch/powerpc/include/asm/simple_spinlock_types.h index 0f3cdd8faa95..08243338069d 100644 --- a/arch/powerpc/include/asm/simple_spinlock_types.h +++ b/arch/powerpc/include/asm/simple_spinlock_types.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H #define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 7ef1cd8168a0..f9e63cacd220 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -62,6 +62,7 @@ struct smp_ops_t { extern int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); extern int smp_send_safe_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); +extern void smp_send_debugger_break_cpu(unsigned int cpu); extern void smp_send_debugger_break(void); extern void start_secondary_resume(void); extern void smp_generic_give_timebase(void); diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h index c5d742f18021..d5f8a74ed2e8 100644 --- a/arch/powerpc/include/asm/spinlock_types.h +++ b/arch/powerpc/include/asm/spinlock_types.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_SPINLOCK_TYPES_H #define _ASM_POWERPC_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h index 1c8460e23583..b1653c160bab 100644 --- a/arch/powerpc/include/asm/stackprotector.h +++ b/arch/powerpc/include/asm/stackprotector.h @@ -24,7 +24,11 @@ static __always_inline void boot_init_stack_canary(void) unsigned long canary; /* Try to get a semi random initial value. 
*/ +#ifdef CONFIG_PREEMPT_RT + canary = (unsigned long)&canary; +#else canary = get_random_canary(); +#endif canary ^= mftb(); canary ^= LINUX_VERSION_CODE; canary &= CANARY_MASK; diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 87013ac2a640..2920ed371188 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -53,6 +53,8 @@ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_lazy_count; /* 0 => preemptable, + <0 => BUG */ unsigned long local_flags; /* private flags for thread */ #ifdef CONFIG_LIVEPATCH unsigned long *livepatch_sp; @@ -99,6 +101,7 @@ void arch_setup_new_exec(void); #define TIF_PATCH_PENDING 6 /* pending live patching update */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SINGLESTEP 8 /* singlestepping active */ +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */ #define TIF_SECCOMP 10 /* secure computing */ #define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */ #define TIF_NOERROR 12 /* Force successful syscall return */ @@ -114,6 +117,7 @@ void arch_setup_new_exec(void); #define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_32BIT 20 /* 32 bit binary */ + /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY) #define _TIF_NOTIFY_SIGNAL (1<<TIF_NOTIFY_SIGNAL) +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -346,7 +346,7 @@ notrace unsigned long interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs) ti_flags = READ_ONCE(current_thread_info()->flags); while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { local_irq_enable(); - if (ti_flags & _TIF_NEED_RESCHED) { + if (ti_flags & _TIF_NEED_RESCHED_MASK) { schedule(); } else { /* @@ -552,11 +552,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) /* Returning to a kernel context with local irqs enabled. */ WARN_ON_ONCE(!(regs->msr & MSR_EE)); again: - if (IS_ENABLED(CONFIG_PREEMPT)) { + if (IS_ENABLED(CONFIG_PREEMPTION)) { /* Return to preemptible kernel context */ if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED)) { if (preempt_count() == 0) preempt_schedule_irq(); + } else if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED_LAZY)) { + if ((preempt_count() == 0) && + (current_thread_info()->preempt_lazy_count == 0)) + preempt_schedule_irq(); } } diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index c4f1d6b7d992..02e17a57da83 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -690,6 +690,7 @@ static inline void check_stack_overflow(void) } } +#ifndef CONFIG_PREEMPT_RT static __always_inline void call_do_softirq(const void *sp) { /* Temporarily switch r1 to sp, call __do_softirq() then restore r1.
*/ @@ -708,6 +709,7 @@ static __always_inline void call_do_softirq(const void *sp) "r11", "r12" ); } +#endif static __always_inline void call_do_irq(struct pt_regs *regs, void *sp) { @@ -820,10 +822,12 @@ void *mcheckirq_ctx[NR_CPUS] __read_mostly; void *softirq_ctx[NR_CPUS] __read_mostly; void *hardirq_ctx[NR_CPUS] __read_mostly; +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { call_do_softirq(softirq_ctx[smp_processor_id()]); } +#endif irq_hw_number_t virq_to_hw(unsigned int virq) { diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index bdee7262c080..d57d37497862 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -120,11 +120,19 @@ int kgdb_skipexception(int exception, struct pt_regs *regs) static int kgdb_debugger_ipi(struct pt_regs *regs) { - kgdb_nmicallback(raw_smp_processor_id(), regs); + int cpu = raw_smp_processor_id(); + + if (!kgdb_roundup_delay(cpu)) + kgdb_nmicallback(cpu, regs); return 0; } #ifdef CONFIG_SMP +void kgdb_roundup_cpu(unsigned int cpu) +{ + smp_send_debugger_break_cpu(cpu); +} + void kgdb_roundup_cpus(void) { smp_send_debugger_break(); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index fb95f92dcfac..308765f2e7a0 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -590,6 +590,11 @@ static void debugger_ipi_callback(struct pt_regs *regs) debugger_ipi(regs); } +void smp_send_debugger_break_cpu(unsigned int cpu) +{ + smp_send_nmi_ipi(cpu, debugger_ipi_callback, 1000000); +} + void smp_send_debugger_break(void) { smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, debugger_ipi_callback, 1000000); diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 11741703d26e..7e4e1f489f56 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -260,12 +260,17 @@ static char *get_mmu_str(void) static int __die(const char *str, struct pt_regs *regs, long err) { + const char *pr = ""; + printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); + if (IS_ENABLED(CONFIG_PREEMPTION)) + pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; + printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", PAGE_SIZE / 1024, get_mmu_str(), - IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + pr, IS_ENABLED(CONFIG_SMP) ? " SMP" : "", IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c index 22ceeeb705ab..d5359701f787 100644 --- a/arch/powerpc/kexec/crash.c +++ b/arch/powerpc/kexec/crash.c @@ -312,9 +312,6 @@ void default_machine_crash_shutdown(struct pt_regs *regs) unsigned int i; int (*old_handler)(struct pt_regs *regs); - /* Avoid hardlocking with irresponsive CPU holding logbuf_lock */ - printk_deferred_enter(); - /* * This function is only called after the system * has panicked or is otherwise in a critical state. 
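The arm64 preempt.h hunk earlier, the powerpc interrupt-exit change above, and the x86 preempt.h hunk below all make the same decision in should_resched() and __preempt_count_dec_and_test(): a plain TIF_NEED_RESCHED preempts exactly as before, while the new TIF_NEED_RESCHED_LAZY only causes a reschedule once both the preempt count and the new preempt_lazy_count have dropped to zero. Stripped of the per-architecture accessors, the check reads roughly like this (a sketch of the shared logic, not literal kernel code):

    /* pc is the raw preempt count with PREEMPT_NEED_RESCHED folded in. */
    static bool should_resched_lazy(unsigned long pc, unsigned long preempt_offset)
    {
            /* Count at the expected level and NEED_RESCHED set: preempt now. */
            if (pc == preempt_offset)
                    return true;
            /* Still inside a preempt-disabled section: never preempt. */
            if ((pc & ~PREEMPT_NEED_RESCHED) != preempt_offset)
                    return false;
            /* Lazy preemption explicitly held off by the scheduler. */
            if (current_thread_info()->preempt_lazy_count)
                    return false;
            return test_thread_flag(TIF_NEED_RESCHED_LAZY);
    }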
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index ff581d70f20c..e5c84d55bdfb 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -178,6 +178,7 @@ config KVM_E500MC config KVM_MPIC bool "KVM in-kernel MPIC emulation" depends on KVM && E500 + depends on !PREEMPT_RT select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 8322ca86d5ac..f524145d7dd3 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -24,6 +24,7 @@ #include <linux/of.h> #include <linux/iommu.h> #include <linux/rculist.h> +#include <linux/local_lock.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/rtas.h> @@ -195,7 +196,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, return ret; } -static DEFINE_PER_CPU(__be64 *, tce_page); +struct tce_page { + __be64 * page; + local_lock_t lock; +}; +static DEFINE_PER_CPU(struct tce_page, tce_page) = { + .lock = INIT_LOCAL_LOCK(lock), +}; static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, @@ -218,9 +225,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, direction, attrs); } - local_irq_save(flags); /* to protect tcep and the page behind it */ + /* to protect tcep and the page behind it */ + local_lock_irqsave(&tce_page.lock, flags); - tcep = __this_cpu_read(tce_page); + tcep = __this_cpu_read(tce_page.page); /* This is safe to do since interrupts are off when we're called * from iommu_alloc{,_sg}() @@ -229,12 +237,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcep = (__be64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { - local_irq_restore(flags); + local_unlock_irqrestore(&tce_page.lock, flags); return tce_build_pSeriesLP(tbl->it_index, tcenum, tceshift, npages, uaddr, direction, attrs); } - __this_cpu_write(tce_page, tcep); + __this_cpu_write(tce_page.page, tcep); } rpn = __pa(uaddr) >> tceshift; @@ -264,7 +272,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcenum += limit; } while (npages > 0 && !rc); - local_irq_restore(flags); + local_unlock_irqrestore(&tce_page.lock, flags); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; @@ -440,16 +448,17 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, DMA_BIDIRECTIONAL, 0); } - local_irq_disable(); /* to protect tcep and the page behind it */ - tcep = __this_cpu_read(tce_page); + /* to protect tcep and the page behind it */ + local_lock_irq(&tce_page.lock); + tcep = __this_cpu_read(tce_page.page); if (!tcep) { tcep = (__be64 *)__get_free_page(GFP_ATOMIC); if (!tcep) { - local_irq_enable(); + local_unlock_irq(&tce_page.lock); return -ENOMEM; } - __this_cpu_write(tce_page, tcep); + __this_cpu_write(tce_page.page, tcep); } proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; @@ -492,7 +501,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, /* error cleanup: caller will clear whole range */ - local_irq_enable(); + local_unlock_irq(&tce_page.lock); return rc; } diff --git a/arch/riscv/include/asm/spinlock_types.h b/arch/riscv/include/asm/spinlock_types.h index f398e7638dd6..5a35a49505da 100644 --- a/arch/riscv/include/asm/spinlock_types.h +++ b/arch/riscv/include/asm/spinlock_types.h @@ -6,7 +6,7 @@ #ifndef _ASM_RISCV_SPINLOCK_TYPES_H #define _ASM_RISCV_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't
include this file directly" #endif diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h index a2bbfd7df85f..b69695e39957 100644 --- a/arch/s390/include/asm/spinlock_types.h +++ b/arch/s390/include/asm/spinlock_types.h @@ -2,7 +2,7 @@ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif diff --git a/arch/sh/include/asm/spinlock_types.h b/arch/sh/include/asm/spinlock_types.h index e82369f286a2..907bda4b1619 100644 --- a/arch/sh/include/asm/spinlock_types.h +++ b/arch/sh/include/asm/spinlock_types.h @@ -2,7 +2,7 @@ #ifndef __ASM_SH_SPINLOCK_TYPES_H #define __ASM_SH_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index ef0f0827cf57..2d3eca8fee01 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -149,6 +149,7 @@ void irq_ctx_exit(int cpu) hardirq_ctx[cpu] = NULL; } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { struct thread_info *curctx; @@ -176,6 +177,7 @@ void do_softirq_own_stack(void) "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr" ); } +#endif #else static inline void handle_one_irq(unsigned int irq) { diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index c8848bb681a1..41fa1be980a3 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -855,6 +855,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs) set_irq_regs(old_regs); } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { void *orig_sp, *sp = softirq_stack[smp_processor_id()]; @@ -869,6 +870,7 @@ void do_softirq_own_stack(void) __asm__ __volatile__("mov %0, %%sp" : : "r" (orig_sp)); } +#endif #ifdef CONFIG_HOTPLUG_CPU void fixup_irqs(void) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d0f16b53393..ecf7aed3ba65 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -107,6 +107,7 @@ config X86 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN + select ARCH_SUPPORTS_RT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS @@ -230,6 +231,7 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index e087cd7837c3..96cc92f63b06 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -202,6 +202,7 @@ IRQ_CONSTRAINTS, regs, vector); \ } +#ifndef CONFIG_PREEMPT_RT /* * Macro to invoke __do_softirq on the irq stack. This is only called from * task context when bottom halves are about to be reenabled and soft @@ -215,6 +216,8 @@ __this_cpu_write(hardirq_stack_inuse, false); \ } +#endif + #else /* CONFIG_X86_64 */ /* System vector handlers always run on the stack they interrupted. 
*/ #define run_sysvec_on_irqstack_cond(func, regs) \ diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index fe5efbcba824..ab8cb5fc2329 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -90,17 +90,48 @@ static __always_inline void __preempt_count_sub(int val) * a decrement which hits zero means we have no preempt_count and should * reschedule. */ -static __always_inline bool __preempt_count_dec_and_test(void) +static __always_inline bool ____preempt_count_dec_and_test(void) { return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); } +static __always_inline bool __preempt_count_dec_and_test(void) +{ + if (____preempt_count_dec_and_test()) + return true; +#ifdef CONFIG_PREEMPT_LAZY + if (preempt_count()) + return false; + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else + return false; +#endif +} + /* * Returns true when we need to resched and can (barring IRQ state). */ static __always_inline bool should_resched(int preempt_offset) { +#ifdef CONFIG_PREEMPT_LAZY + u32 tmp; + tmp = raw_cpu_read_4(__preempt_count); + if (tmp == preempt_offset) + return true; + + /* preempt count == 0 ? */ + tmp &= ~PREEMPT_NEED_RESCHED; + if (tmp != preempt_offset) + return false; + /* XXX PREEMPT_LOCK_OFFSET */ + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); +#endif } #ifdef CONFIG_PREEMPTION diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 2dfb5fea13af..fc03f4f7ed84 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -28,6 +28,19 @@ typedef struct { #define SA_IA32_ABI 0x02000000u #define SA_X32_ABI 0x01000000u +/* + * Because some traps use the IST stack, we must keep preemption + * disabled while calling do_trap(), but do_trap() may call + * force_sig_info() which will grab the signal spin_locks for the + * task, which in PREEMPT_RT are mutexes. By defining + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the + * trap. + */ +#if defined(CONFIG_PREEMPT_RT) +#define ARCH_RT_DELAYS_SIGNAL_SEND +#endif + #ifndef CONFIG_COMPAT #define compat_sigset_t compat_sigset_t typedef sigset_t compat_sigset_t; diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 24a8d6c4fb18..2fc22c27df18 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -50,7 +50,7 @@ */ static __always_inline void boot_init_stack_canary(void) { - u64 canary; + u64 canary = 0; u64 tsc; #ifdef CONFIG_X86_64 @@ -61,8 +61,14 @@ static __always_inline void boot_init_stack_canary(void) * of randomness. The TSC only matters for very early init, * there it already has some randomness on most systems. Later * on during the bootup the random pool has true entropy too. + * For preempt-rt we need to weaken the randomness a bit, as + * we can't call into the random generator from atomic context + * due to locking constraints. We just leave canary + * uninitialized and use the TSC based randomness on top of it. 
*/ +#ifndef CONFIG_PREEMPT_RT get_random_bytes(&canary, sizeof(canary)); +#endif tsc = rdtsc(); canary += tsc + (tsc << 32UL); canary &= CANARY_MASK; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index cf132663c219..75dc786e6365 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -57,11 +57,14 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long syscall_work; /* SYSCALL_WORK_ flags */ u32 status; /* thread synchronous flags */ + int preempt_lazy_count; /* 0 => lazy preemptable + <0 => BUG */ }; #define INIT_THREAD_INFO(tsk) \ { \ .flags = 0, \ + .preempt_lazy_count = 0, \ } #else /* !__ASSEMBLY__ */ @@ -90,6 +93,7 @@ struct thread_info { #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ #define TIF_SLD 18 /* Restore split lock detection on context switch */ +#define TIF_NEED_RESCHED_LAZY 19 /* lazy rescheduling necessary */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @@ -114,6 +118,7 @@ struct thread_info { #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_SLD (1 << TIF_SLD) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 722fd712e1cf..82cc3a7be6bd 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -141,7 +141,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, */ if (visit_mask) { if (*visit_mask & (1UL << info->type)) { - printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); + pr_warn_once("WARNING: stack recursion on stack type %d\n", info->type); goto unknown; } *visit_mask |= 1UL << info->type; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6c5defd6569a..5f725b0ceb29 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -207,7 +207,8 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (visit_mask) { if (*visit_mask & (1UL << info->type)) { if (task == current) - printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); + pr_warn_once("WARNING: stack recursion on stack type %d\n", + info->type); goto unknown; } *visit_mask |= 1UL << info->type; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 15aefa3f3e18..52af9a89ad47 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -207,8 +207,7 @@ static void mask_and_ack_8259A(struct irq_data *data) * lets ACK and report it. 
[once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { - printk_deferred(KERN_DEBUG - "spurious 8259A interrupt: IRQ%d.\n", irq); + printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); spurious_irq_mask |= irqmask; } atomic_inc(&irq_err_count); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 044902d5a3c4..e5dd6da78713 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -132,6 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) return 0; } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { struct irq_stack *irqstk; @@ -148,6 +149,7 @@ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } +#endif void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) { diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 3a43a2dee658..37bd37cdf2b6 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -502,9 +502,12 @@ static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs) if (atomic_read(&kgdb_active) != -1) { /* KGDB CPU roundup */ cpu = raw_smp_processor_id(); - kgdb_nmicallback(cpu, regs); - set_bit(cpu, was_in_debug_nmi); - touch_nmi_watchdog(); + + if (!kgdb_roundup_delay(cpu)) { + kgdb_nmicallback(cpu, regs); + set_bit(cpu, was_in_debug_nmi); + touch_nmi_watchdog(); + } return NMI_HANDLED; } diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index d7c44b257f7f..2d0361cd304f 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -41,9 +41,9 @@ static void unwind_dump(struct unwind_state *state) dumped_before = true; - printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n", - state->stack_info.type, state->stack_info.next_sp, - state->stack_mask, state->graph_idx); + printk("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n", + state->stack_info.type, state->stack_info.next_sp, + state->stack_mask, state->graph_idx); for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { @@ -59,13 +59,11 @@ static void unwind_dump(struct unwind_state *state) if (zero) { if (!prev_zero) - printk_deferred("%p: %0*x ...\n", - sp, BITS_PER_LONG/4, 0); + printk("%p: %0*x ...\n", sp, BITS_PER_LONG/4, 0); continue; } - printk_deferred("%p: %0*lx (%pB)\n", - sp, BITS_PER_LONG/4, word, (void *)word); + printk("%p: %0*lx (%pB)\n", sp, BITS_PER_LONG/4, word, (void *)word); } } } @@ -342,13 +340,13 @@ bool unwind_next_frame(struct unwind_state *state) goto the_end; if (state->regs) { - printk_deferred_once(KERN_WARNING + pr_warn_once( "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", state->regs, state->task->comm, state->task->pid, next_bp); unwind_dump(state); } else { - printk_deferred_once(KERN_WARNING + pr_warn_once( "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", state->bp, state->task->comm, state->task->pid, next_bp); diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index a1202536fc57..a26a7c3849f5 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -9,7 +9,7 @@ #include <asm/orc_types.h> #include <asm/orc_lookup.h> #define orc_warn(fmt, ...) \ - printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__) + pr_warn_once("WARNING: " fmt, ##__VA_ARGS__) #define orc_warn_current(args...)
\ ({ \ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 39aaa21e28f7..37f76d0fe74a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8550,6 +8550,14 @@ int kvm_arch_init(void *opaque) goto out; } +#ifdef CONFIG_PREEMPT_RT + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); + r = -EOPNOTSUPP; + goto out; + } +#endif + r = -ENOMEM; x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), __alignof__(struct fpu), SLAB_ACCOUNT, diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h index 64c9389254f1..797aed7df3dd 100644 --- a/arch/xtensa/include/asm/spinlock_types.h +++ b/arch/xtensa/include/asm/spinlock_types.h @@ -2,7 +2,7 @@ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) +#if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H) # error "please don't include this file directly" #endif diff --git a/block/blk-mq.c b/block/blk-mq.c index 95993c4efa49..2f173fea818c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1565,14 +1565,14 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, return; if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { - int cpu = get_cpu(); + int cpu = get_cpu_light(); if (cpumask_test_cpu(cpu, hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); - put_cpu(); + put_cpu_light(); return; } - put_cpu(); + put_cpu_light(); } kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, diff --git a/crypto/cryptd.c b/crypto/cryptd.c index 668095eca0fa..32bdabe6767d 100644 --- a/crypto/cryptd.c +++ b/crypto/cryptd.c @@ -36,6 +36,7 @@ static struct workqueue_struct *cryptd_wq; struct cryptd_cpu_queue { struct crypto_queue queue; struct work_struct work; + spinlock_t qlock; }; struct cryptd_queue { @@ -109,6 +110,7 @@ static int cryptd_init_queue(struct cryptd_queue *queue, cpu_queue = per_cpu_ptr(queue->cpu_queue, cpu); crypto_init_queue(&cpu_queue->queue, max_cpu_qlen); INIT_WORK(&cpu_queue->work, cryptd_queue_worker); + spin_lock_init(&cpu_queue->qlock); } pr_info("cryptd: max_cpu_qlen set to %d\n", max_cpu_qlen); return 0; diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 70f69f0910c9..58eee8eab4bf 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1061,14 +1061,14 @@ static void generate_random_testvec_config(struct testvec_config *cfg, static void crypto_disable_simd_for_test(void) { - preempt_disable(); + migrate_disable(); __this_cpu_write(crypto_simd_disabled_for_test, true); } static void crypto_reenable_simd_for_test(void) { __this_cpu_write(crypto_simd_disabled_for_test, false); - preempt_enable(); + migrate_enable(); } /* diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6383c81ac5b3..abb695f5f5e4 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -59,6 +59,40 @@ static void zram_free_page(struct zram *zram, size_t index); static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio); +#ifdef CONFIG_PREEMPT_RT +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) +{ + size_t index; + + for (index = 0; index < num_pages; index++) + spin_lock_init(&zram->table[index].lock); +} + +static int zram_slot_trylock(struct zram *zram, u32 index) +{ + int ret; + + ret = spin_trylock(&zram->table[index].lock); + if (ret) + __set_bit(ZRAM_LOCK, &zram->table[index].flags); + return ret; 
+} + +static void zram_slot_lock(struct zram *zram, u32 index) +{ + spin_lock(&zram->table[index].lock); + __set_bit(ZRAM_LOCK, &zram->table[index].flags); +} + +static void zram_slot_unlock(struct zram *zram, u32 index) +{ + __clear_bit(ZRAM_LOCK, &zram->table[index].flags); + spin_unlock(&zram->table[index].lock); +} + +#else + +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { } static int zram_slot_trylock(struct zram *zram, u32 index) { @@ -74,6 +108,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) { bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); } +#endif static inline bool init_done(struct zram *zram) { @@ -1169,6 +1204,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); + zram_meta_init_table_locks(zram, num_pages); return true; } diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 80c3b43b4828..d8f6d880f915 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -63,6 +63,7 @@ struct zram_table_entry { unsigned long element; }; unsigned long flags; + spinlock_t lock; #ifdef CONFIG_ZRAM_MEMORY_TRACKING ktime_t ac_time; #endif diff --git a/drivers/char/random.c b/drivers/char/random.c index 7f6585c49380..321f7e2f4f57 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -183,7 +183,7 @@ static void __cold process_random_ready_list(void) #define warn_unseeded_randomness() \ if (IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM) && !crng_ready()) \ - printk_deferred(KERN_NOTICE "random: %s called from %pS with crng_init=%d\n", \ + pr_notice("random: %s called from %pS with crng_init=%d\n", \ __func__, (void *)_RET_IP_, crng_init) diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index d3f2e5364c27..9c4a99757afd 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da return container_of(data, struct tpm_tis_tcg_phy, priv); } +#ifdef CONFIG_PREEMPT_RT +/* + * Flushes previous write operations to the chip so that subsequent + * ioread*()s won't stall a CPU.
+ */ +static inline void tpm_tis_flush(void __iomem *iobase) +{ + ioread8(iobase + TPM_ACCESS(0)); +} +#else +#define tpm_tis_flush(iobase) do { } while (0) +#endif + +static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr) +{ + iowrite8(b, iobase + addr); + tpm_tis_flush(iobase); +} + +static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr) +{ + iowrite32(b, iobase + addr); + tpm_tis_flush(iobase); +} + static int interrupts = -1; module_param(interrupts, int, 0444); MODULE_PARM_DESC(interrupts, "Enable interrupts"); @@ -169,7 +194,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); while (len--) - iowrite8(*value++, phy->iobase + addr); + tpm_tis_iowrite8(*value++, phy->iobase, addr); return 0; } @@ -196,7 +221,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value) { struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); - iowrite32(value, phy->iobase + addr); + tpm_tis_iowrite32(value, phy->iobase, addr); return 0; } diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index e3df82d5d37a..5502e176d51b 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -66,7 +66,7 @@ struct mm_struct efi_mm = { struct workqueue_struct *efi_rts_wq; -static bool disable_runtime; +static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT); static int __init setup_noefi(char *arg) { disable_runtime = true; @@ -97,6 +97,9 @@ static int __init parse_efi_cmdline(char *str) if (parse_option_str(str, "noruntime")) disable_runtime = true; + if (parse_option_str(str, "runtime")) + disable_runtime = false; + if (parse_option_str(str, "nosoftreserve")) set_bit(EFI_MEM_NO_SOFT_RESERVE, &efi.flags); diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c index 254e67141a77..7a39029b083f 100644 --- a/drivers/gpu/drm/i915/display/intel_crtc.c +++ b/drivers/gpu/drm/i915/display/intel_crtc.c @@ -425,7 +425,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) */ intel_psr_wait_for_idle(new_crtc_state); - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); crtc->debug.min_vbl = min; crtc->debug.max_vbl = max; @@ -450,11 +451,13 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) break; } - local_irq_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); timeout = schedule_timeout(timeout); - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); } finish_wait(wq, &wait); @@ -487,7 +490,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) return; irq_disable: - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); } #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE) @@ -566,7 +570,8 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state) new_crtc_state->uapi.event = NULL; } - local_irq_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); /* Send VRR Push to terminate Vblank */ intel_vrr_send_push(new_crtc_state); diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c index 209cf265bf74..6e1b9068d944 100644 --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c @@ -311,10 +311,9 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) /* Kick the work once more to drain the signalers, and 
disarm the irq */ irq_work_sync(&b->irq_work); while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { - local_irq_disable(); - signal_irq_work(&b->irq_work); - local_irq_enable(); + irq_work_queue(&b->irq_work); cond_resched(); + irq_work_sync(&b->irq_work); } } diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h index c41098950746..601274ba86e4 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.h +++ b/drivers/gpu/drm/i915/gt/intel_context.h @@ -163,7 +163,8 @@ static inline void intel_context_enter(struct intel_context *ce) static inline void intel_context_mark_active(struct intel_context *ce) { - lockdep_assert_held(&ce->timeline->mutex); + lockdep_assert(lockdep_is_held(&ce->timeline->mutex) || + test_bit(CONTEXT_IS_PARKED, &ce->flags)); ++ce->active_count; } diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index e54351a170e2..1022be795e68 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -112,6 +112,7 @@ struct intel_context { #define CONTEXT_FORCE_SINGLE_SUBMISSION 7 #define CONTEXT_NOPREEMPT 8 #define CONTEXT_LRCA_DIRTY 9 +#define CONTEXT_IS_PARKED 10 struct { u64 timeout_us; diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c index 1f07ac4e0672..e84f03a276d1 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c @@ -80,39 +80,6 @@ static int __engine_unpark(struct intel_wakeref *wf) return 0; } -#if IS_ENABLED(CONFIG_LOCKDEP) - -static unsigned long __timeline_mark_lock(struct intel_context *ce) -{ - unsigned long flags; - - local_irq_save(flags); - mutex_acquire(&ce->timeline->mutex.dep_map, 2, 0, _THIS_IP_); - - return flags; -} - -static void __timeline_mark_unlock(struct intel_context *ce, - unsigned long flags) -{ - mutex_release(&ce->timeline->mutex.dep_map, _THIS_IP_); - local_irq_restore(flags); -} - -#else - -static unsigned long __timeline_mark_lock(struct intel_context *ce) -{ - return 0; -} - -static void __timeline_mark_unlock(struct intel_context *ce, - unsigned long flags) -{ -} - -#endif /* !IS_ENABLED(CONFIG_LOCKDEP) */ - static void duration(struct dma_fence *fence, struct dma_fence_cb *cb) { struct i915_request *rq = to_request(fence); @@ -159,7 +126,6 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine) { struct intel_context *ce = engine->kernel_context; struct i915_request *rq; - unsigned long flags; bool result = true; /* GPU is pointing to the void, as good as in the kernel context. */ @@ -201,7 +167,7 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine) * engine->wakeref.count, we may see the request completion and retire * it causing an underflow of the engine->wakeref. 
*/ - flags = __timeline_mark_lock(ce); + set_bit(CONTEXT_IS_PARKED, &ce->flags); GEM_BUG_ON(atomic_read(&ce->timeline->active_count) < 0); rq = __i915_request_create(ce, GFP_NOWAIT); @@ -233,7 +199,7 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine) result = false; out_unlock: - __timeline_mark_unlock(ce, flags); + clear_bit(CONTEXT_IS_PARKED, &ce->flags); return result; } diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index cafb0608ffb4..07156996fc82 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -1283,7 +1283,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) * and context switches) submission. */ - spin_lock(&sched_engine->lock); + spin_lock_irq(&sched_engine->lock); /* * If the queue is higher priority than the last @@ -1383,7 +1383,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) * Even if ELSP[1] is occupied and not worthy * of timeslices, our queue might be. */ - spin_unlock(&sched_engine->lock); + spin_unlock_irq(&sched_engine->lock); return; } } @@ -1409,7 +1409,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) if (last && !can_merge_rq(last, rq)) { spin_unlock(&ve->base.sched_engine->lock); - spin_unlock(&engine->sched_engine->lock); + spin_unlock_irq(&engine->sched_engine->lock); return; /* leave this for another sibling */ } @@ -1571,7 +1571,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) */ sched_engine->queue_priority_hint = queue_prio(sched_engine); i915_sched_engine_reset_on_empty(sched_engine); - spin_unlock(&sched_engine->lock); + spin_unlock_irq(&sched_engine->lock); /* * We can skip poking the HW if we ended up with exactly the same set @@ -1597,13 +1597,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) } } -static void execlists_dequeue_irq(struct intel_engine_cs *engine) -{ - local_irq_disable(); /* Suspend interrupts across request submission */ - execlists_dequeue(engine); - local_irq_enable(); /* flush irq_work (e.g. breadcrumb enabling) */ -} - static void clear_ports(struct i915_request **ports, int count) { memset_p((void **)ports, NULL, count); @@ -2423,7 +2416,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t) } if (!engine->execlists.pending[0]) { - execlists_dequeue_irq(engine); + execlists_dequeue(engine); start_timeslice(engine); } diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 9bc4f4a8e12e..547347241a47 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -886,7 +886,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, */ spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); - /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); /* Get optional system timestamp before query. */ if (stime) @@ -950,7 +951,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, if (etime) *etime = ktime_get(); - /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. 
*/ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 79da5eca60af..b9dd6100c6d1 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -559,7 +559,6 @@ bool __i915_request_submit(struct i915_request *request) RQ_TRACE(request, "\n"); - GEM_BUG_ON(!irqs_disabled()); lockdep_assert_held(&engine->sched_engine->lock); /* @@ -668,7 +667,6 @@ void __i915_request_unsubmit(struct i915_request *request) */ RQ_TRACE(request, "\n"); - GEM_BUG_ON(!irqs_disabled()); lockdep_assert_held(&engine->sched_engine->lock); /* diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 1bc1349ba3c2..a2f713b4ac2f 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -609,7 +609,8 @@ i915_request_timeline(const struct i915_request *rq) { /* Valid only while the request is being constructed (or retired). */ return rcu_dereference_protected(rq->timeline, - lockdep_is_held(&rcu_access_pointer(rq->timeline)->mutex)); + lockdep_is_held(&rcu_access_pointer(rq->timeline)->mutex) || + test_bit(CONTEXT_IS_PARKED, &rq->context->flags)); } static inline struct i915_gem_context * diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h index 63fec1c3c132..f345a0f12bf6 100644 --- a/drivers/gpu/drm/i915/i915_trace.h +++ b/drivers/gpu/drm/i915/i915_trace.h @@ -2,6 +2,10 @@ #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) #define _I915_TRACE_H_ +#ifdef CONFIG_PREEMPT_RT +#define NOTRACE +#endif + #include #include #include @@ -819,7 +823,7 @@ DEFINE_EVENT(i915_request, i915_request_add, TP_ARGS(rq) ); -#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) +#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE) DEFINE_EVENT(i915_request, i915_request_guc_submit, TP_PROTO(struct i915_request *rq), TP_ARGS(rq) diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h index 5259edacde38..b36b27c09049 100644 --- a/drivers/gpu/drm/i915/i915_utils.h +++ b/drivers/gpu/drm/i915/i915_utils.h @@ -343,7 +343,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. */ -#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) +#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT) # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) #else # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) diff --git a/drivers/i2c/busses/i2c-cht-wc.c b/drivers/i2c/busses/i2c-cht-wc.c index 1cf68f85b2e1..8ccf0c928bb4 100644 --- a/drivers/i2c/busses/i2c-cht-wc.c +++ b/drivers/i2c/busses/i2c-cht-wc.c @@ -99,15 +99,8 @@ static irqreturn_t cht_wc_i2c_adap_thread_handler(int id, void *data) * interrupt handler as well, so running the client irq handler from * this thread will cause things to lock up. */ - if (reg & CHT_WC_EXTCHGRIRQ_CLIENT_IRQ) { - /* - * generic_handle_irq expects local IRQs to be disabled - * as normally it is called from interrupt context. 
- */ - local_irq_disable(); - generic_handle_irq(adap->client_irq); - local_irq_enable(); - } + if (reg & CHT_WC_EXTCHGRIRQ_CLIENT_IRQ) + generic_handle_irq_safe(adap->client_irq); return IRQ_HANDLED; } diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index cfbef70e8ba7..cded25be1f55 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -1422,7 +1422,7 @@ int i2c_handle_smbus_host_notify(struct i2c_adapter *adap, unsigned short addr) if (irq <= 0) return -ENXIO; - generic_handle_irq(irq); + generic_handle_irq_safe(irq); return 0; } diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig index 1f1d57288085..dc6816d36d06 100644 --- a/drivers/leds/trigger/Kconfig +++ b/drivers/leds/trigger/Kconfig @@ -64,6 +64,7 @@ config LEDS_TRIGGER_BACKLIGHT config LEDS_TRIGGER_CPU bool "LED CPU Trigger" + depends on !PREEMPT_RT help This allows LEDs to be controlled by active CPUs. This shows the active CPUs across an array of LEDs so you can see which diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e54d802ee0bb..ee79e4146c70 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2217,8 +2217,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) struct raid5_percpu *percpu; unsigned long cpu; - cpu = get_cpu(); + cpu = get_cpu_light(); percpu = per_cpu_ptr(conf->percpu, cpu); + spin_lock(&percpu->lock); if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; @@ -2277,7 +2278,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) if (test_and_clear_bit(R5_Overlap, &dev->flags)) wake_up(&sh->raid_conf->wait_for_overlap); } - put_cpu(); + spin_unlock(&percpu->lock); + put_cpu_light(); } static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) @@ -7099,6 +7101,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) __func__, cpu); return -ENOMEM; } + spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock); return 0; } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 5c05acf20e1f..665fe138ab4f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -635,6 +635,7 @@ struct r5conf { int recovery_disabled; /* per cpu variables */ struct raid5_percpu { + spinlock_t lock; /* Protection for -RT */ struct page *spare_page; /* Used when checking P/Q in raid6 */ void *scribble; /* space for constructing buffer * lists and performing address diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c index 70fa18b04ad2..b14d3f98e1eb 100644 --- a/drivers/mfd/ezx-pcap.c +++ b/drivers/mfd/ezx-pcap.c @@ -193,13 +193,11 @@ static void pcap_isr_work(struct work_struct *work) ezx_pcap_write(pcap, PCAP_REG_MSR, isr | msr); ezx_pcap_write(pcap, PCAP_REG_ISR, isr); - local_irq_disable(); service = isr & ~msr; for (irq = pcap->irq_base; service; service >>= 1, irq++) { if (service & 1) - generic_handle_irq(irq); + generic_handle_irq_safe(irq); } - local_irq_enable(); ezx_pcap_write(pcap, PCAP_REG_MSR, pcap->msr); } while (gpio_get_value(pdata->gpio)); } diff --git a/drivers/misc/hi6421v600-irq.c b/drivers/misc/hi6421v600-irq.c index 08535e97ff43..0585a5821d05 100644 --- a/drivers/misc/hi6421v600-irq.c +++ b/drivers/misc/hi6421v600-irq.c @@ -118,8 +118,8 @@ static irqreturn_t hi6421v600_irq_handler(int irq, void *__priv) * If both powerkey down and up IRQs are received, * handle them at the right order */ - generic_handle_irq(priv->irqs[POWERKEY_DOWN]); - generic_handle_irq(priv->irqs[POWERKEY_UP]); + 
generic_handle_irq_safe(priv->irqs[POWERKEY_DOWN]); + generic_handle_irq_safe(priv->irqs[POWERKEY_UP]); pending &= ~HISI_IRQ_POWERKEY_UP_DOWN; } @@ -127,7 +127,7 @@ static irqreturn_t hi6421v600_irq_handler(int irq, void *__priv) continue; for_each_set_bit(offset, &pending, BITS_PER_BYTE) { - generic_handle_irq(priv->irqs[offset + i * BITS_PER_BYTE]); + generic_handle_irq_safe(priv->irqs[offset + i * BITS_PER_BYTE]); } } diff --git a/drivers/net/ethernet/netronome/nfp/abm/qdisc.c b/drivers/net/ethernet/netronome/nfp/abm/qdisc.c index 2473fb5f75e5..2a5cc64227e9 100644 --- a/drivers/net/ethernet/netronome/nfp/abm/qdisc.c +++ b/drivers/net/ethernet/netronome/nfp/abm/qdisc.c @@ -458,7 +458,7 @@ nfp_abm_qdisc_graft(struct nfp_abm_link *alink, u32 handle, u32 child_handle, static void nfp_abm_stats_calculate(struct nfp_alink_stats *new, struct nfp_alink_stats *old, - struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_sync *bstats, struct gnet_stats_queue *qstats) { _bstats_update(bstats, new->tx_bytes - old->tx_bytes, diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 3e1a83a22fdd..bce0a6bd46a7 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1367,11 +1367,8 @@ static void lan78xx_status(struct lan78xx_net *dev, struct urb *urb) netif_dbg(dev, link, dev->net, "PHY INTR: 0x%08x\n", intdata); lan78xx_defer_kevent(dev, EVENT_LINK_RESET); - if (dev->domain_data.phyirq > 0) { - local_irq_disable(); - generic_handle_irq(dev->domain_data.phyirq); - local_irq_enable(); - } + if (dev->domain_data.phyirq > 0) + generic_handle_irq_safe(dev->domain_data.phyirq); } else { netdev_warn(dev->net, "unexpected interrupt: 0x%08x\n", intdata); diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 5ae6c207d3ac..660908027dc5 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -1450,11 +1450,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev, static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen) { struct fcoe_percpu_s *fps; - int rc; + int rc, cpu = get_cpu_light(); - fps = &get_cpu_var(fcoe_percpu); + fps = &per_cpu(fcoe_percpu, cpu); rc = fcoe_get_paged_crc_eof(skb, tlen, fps); - put_cpu_var(fcoe_percpu); + put_cpu_light(); return rc; } @@ -1639,11 +1639,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport, return 0; } - stats = per_cpu_ptr(lport->stats, get_cpu()); + stats = per_cpu_ptr(lport->stats, get_cpu_light()); stats->InvalidCRCCount++; if (stats->InvalidCRCCount < 5) printk(KERN_WARNING "fcoe: dropping frame with CRC error\n"); - put_cpu(); + put_cpu_light(); return -EINVAL; } @@ -1684,7 +1684,7 @@ static void fcoe_recv_frame(struct sk_buff *skb) */ hp = (struct fcoe_hdr *) skb_network_header(skb); - stats = per_cpu_ptr(lport->stats, get_cpu()); + stats = per_cpu_ptr(lport->stats, get_cpu_light()); if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) { if (stats->ErrorFrames < 5) printk(KERN_WARNING "fcoe: FCoE version " @@ -1716,13 +1716,13 @@ static void fcoe_recv_frame(struct sk_buff *skb) goto drop; if (!fcoe_filter_frames(lport, fp)) { - put_cpu(); + put_cpu_light(); fc_exch_recv(lport, fp); return; } drop: stats->ErrorFrames++; - put_cpu(); + put_cpu_light(); kfree_skb(skb); } diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c index 558f3f4e1859..f08feaa4f398 100644 --- a/drivers/scsi/fcoe/fcoe_ctlr.c +++ b/drivers/scsi/fcoe/fcoe_ctlr.c @@ -828,7 +828,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) 
INIT_LIST_HEAD(&del_list); - stats = per_cpu_ptr(fip->lp->stats, get_cpu()); + stats = per_cpu_ptr(fip->lp->stats, get_cpu_light()); list_for_each_entry_safe(fcf, next, &fip->fcfs, list) { deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2; @@ -864,7 +864,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) sel_time = fcf->time; } } - put_cpu(); + put_cpu_light(); list_for_each_entry_safe(fcf, next, &del_list, list) { /* Removes fcf from current list */ diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c index aa223db4cf53..0ceb93800704 100644 --- a/drivers/scsi/libfc/fc_exch.c +++ b/drivers/scsi/libfc/fc_exch.c @@ -825,10 +825,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport, } memset(ep, 0, sizeof(*ep)); - cpu = get_cpu(); + cpu = get_cpu_light(); pool = per_cpu_ptr(mp->pool, cpu); spin_lock_bh(&pool->lock); - put_cpu(); + put_cpu_light(); /* peek cache of free slot */ if (pool->left != FC_XID_UNKNOWN) { diff --git a/drivers/staging/greybus/gpio.c b/drivers/staging/greybus/gpio.c index 7e6347fe93f9..8a7cf1d0e968 100644 --- a/drivers/staging/greybus/gpio.c +++ b/drivers/staging/greybus/gpio.c @@ -391,10 +391,7 @@ static int gb_gpio_request_handler(struct gb_operation *op) return -EINVAL; } - local_irq_disable(); - ret = generic_handle_irq(irq); - local_irq_enable(); - + ret = generic_handle_irq_safe(irq); if (ret) dev_err(dev, "failed to invoke irq handler\n"); diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h index 6473361525d1..2321d02e9b7a 100644 --- a/drivers/tty/serial/8250/8250.h +++ b/drivers/tty/serial/8250/8250.h @@ -132,12 +132,55 @@ static inline void serial_dl_write(struct uart_8250_port *up, int value) up->dl_write(up, value); } +static inline void serial8250_set_IER(struct uart_8250_port *up, + unsigned char ier) +{ + struct uart_port *port = &up->port; + unsigned long flags; + bool is_console; + + is_console = uart_console(port); + + if (is_console) + console_atomic_lock(flags); + + serial_out(up, UART_IER, ier); + + if (is_console) + console_atomic_unlock(flags); +} + +static inline unsigned char serial8250_clear_IER(struct uart_8250_port *up) +{ + struct uart_port *port = &up->port; + unsigned int clearval = 0; + unsigned long flags; + unsigned int prior; + bool is_console; + + is_console = uart_console(port); + + if (up->capabilities & UART_CAP_UUE) + clearval = UART_IER_UUE; + + if (is_console) + console_atomic_lock(flags); + + prior = serial_port_in(port, UART_IER); + serial_port_out(port, UART_IER, clearval); + + if (is_console) + console_atomic_unlock(flags); + + return prior; +} + static inline bool serial8250_set_THRI(struct uart_8250_port *up) { if (up->ier & UART_IER_THRI) return false; up->ier |= UART_IER_THRI; - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); return true; } @@ -146,7 +189,7 @@ static inline bool serial8250_clear_THRI(struct uart_8250_port *up) if (!(up->ier & UART_IER_THRI)) return false; up->ier &= ~UART_IER_THRI; - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); return true; } diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index 1ce193daea7f..fad00c0414e3 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -264,10 +264,8 @@ static void serial8250_backup_timeout(struct timer_list *t) * Must disable interrupts or else we risk racing with the interrupt * based handler. 
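
The fcoe/libfc hunks above replace get_cpu()/put_cpu() with get_cpu_light()/put_cpu_light(). In the RT tree, get_cpu_light() only disables migration, not preemption, so per-CPU data that needs real mutual exclusion grows its own lock, as the raid5 change earlier in this series does. A sketch of the resulting pattern with a hypothetical per-CPU structure:

.. code-block:: c

    #include <linux/percpu.h>
    #include <linux/spinlock.h>

    struct example_percpu {
            spinlock_t lock;                /* protection for -RT */
            unsigned long counter;
    };

    /* The lock must be spin_lock_init()ed in a cpuhp callback, as
     * raid456_cpu_up_prepare() does above. */
    static DEFINE_PER_CPU(struct example_percpu, example_data);

    static void example_update(void)
    {
            struct example_percpu *p;
            int cpu;

            cpu = get_cpu_light();          /* migrate_disable() on RT */
            p = per_cpu_ptr(&example_data, cpu);
            spin_lock(&p->lock);            /* still preemptible on RT */
            p->counter++;
            spin_unlock(&p->lock);
            put_cpu_light();
    }
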
*/ - if (up->port.irq) { - ier = serial_in(up, UART_IER); - serial_out(up, UART_IER, 0); - } + if (up->port.irq) + ier = serial8250_clear_IER(up); iir = serial_in(up, UART_IIR); @@ -290,7 +288,7 @@ static void serial8250_backup_timeout(struct timer_list *t) serial8250_tx_chars(up); if (up->port.irq) - serial_out(up, UART_IER, ier); + serial8250_set_IER(up, ier); spin_unlock_irqrestore(&up->port.lock, flags); @@ -568,6 +566,14 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) #ifdef CONFIG_SERIAL_8250_CONSOLE +static void univ8250_console_write_atomic(struct console *co, const char *s, + unsigned int count) +{ + struct uart_8250_port *up = &serial8250_ports[co->index]; + + serial8250_console_write_atomic(up, s, count); +} + static void univ8250_console_write(struct console *co, const char *s, unsigned int count) { @@ -661,6 +667,7 @@ static int univ8250_console_match(struct console *co, char *name, int idx, static struct console univ8250_console = { .name = "ttyS", + .write_atomic = univ8250_console_write_atomic, .write = univ8250_console_write, .device = uart_console_device, .setup = univ8250_console_setup, diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c index fc65a2293ce9..19a92530040f 100644 --- a/drivers/tty/serial/8250/8250_fsl.c +++ b/drivers/tty/serial/8250/8250_fsl.c @@ -60,9 +60,18 @@ int fsl8250_handle_irq(struct uart_port *port) /* Stop processing interrupts on input overrun */ if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { + unsigned long flags; unsigned long delay; + bool is_console; + is_console = uart_console(port); + + if (is_console) + console_atomic_lock(flags); up->ier = port->serial_in(port, UART_IER); + if (is_console) + console_atomic_unlock(flags); + if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { port->ops->stop_rx(port); } else { diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c index 65402d05eff9..8122645ab05c 100644 --- a/drivers/tty/serial/8250/8250_ingenic.c +++ b/drivers/tty/serial/8250/8250_ingenic.c @@ -146,6 +146,8 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart", static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) { + unsigned long flags; + bool is_console; int ier; switch (offset) { @@ -167,7 +169,12 @@ static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) * If we have enabled modem status IRQs we should enable * modem mode. 
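
The 8250_fsl hunk above and the 8250_ingenic/8250_mtk hunks that follow all bracket raw UART_IER accesses with console_atomic_lock()/console_atomic_unlock() when the port is a console, so the atomic console path introduced by this series never observes a half-updated interrupt mask. The common shape, as a sketch with a hypothetical helper:

.. code-block:: c

    #include <linux/serial_core.h>
    #include <linux/serial_reg.h>

    static unsigned int example_read_ier(struct uart_port *port)
    {
            bool is_console = uart_console(port);
            unsigned long flags;
            unsigned int ier;

            if (is_console)
                    console_atomic_lock(flags);
            ier = port->serial_in(port, UART_IER);
            if (is_console)
                    console_atomic_unlock(flags);

            return ier;
    }
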
*/ + is_console = uart_console(p); + if (is_console) + console_atomic_lock(flags); ier = p->serial_in(p, UART_IER); + if (is_console) + console_atomic_unlock(flags); if (ier & UART_IER_MSI) value |= UART_MCR_MDCE | UART_MCR_FCM; diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c index de48a58460f4..364ee950f21a 100644 --- a/drivers/tty/serial/8250/8250_mtk.c +++ b/drivers/tty/serial/8250/8250_mtk.c @@ -222,12 +222,37 @@ static void mtk8250_shutdown(struct uart_port *port) static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask) { - serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask)); + struct uart_port *port = &up->port; + unsigned long flags; + unsigned int ier; + bool is_console; + + is_console = uart_console(port); + + if (is_console) + console_atomic_lock(flags); + + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, ier & (~mask)); + + if (is_console) + console_atomic_unlock(flags); } static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask) { - serial_out(up, UART_IER, serial_in(up, UART_IER) | mask); + struct uart_port *port = &up->port; + unsigned long flags; + unsigned int ier; + + if (uart_console(port)) + console_atomic_lock(flags); + + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, ier | mask); + + if (uart_console(port)) + console_atomic_unlock(flags); } static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index df9731f73746..363888c2678b 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -770,7 +770,7 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) serial_out(p, UART_EFR, UART_EFR_ECB); serial_out(p, UART_LCR, 0); } - serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); + serial8250_set_IER(p, sleep ? 
UART_IERX_SLEEP : 0); if (p->capabilities & UART_CAP_EFR) { serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(p, UART_EFR, efr); @@ -1444,7 +1444,7 @@ static void serial8250_stop_rx(struct uart_port *port) up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); up->port.read_status_mask &= ~UART_LSR_DR; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); serial8250_rpm_put(up); } @@ -1474,7 +1474,7 @@ void serial8250_em485_stop_tx(struct uart_8250_port *p) serial8250_clear_and_reinit_fifos(p); p->ier |= UART_IER_RLSI | UART_IER_RDI; - serial_port_out(&p->port, UART_IER, p->ier); + serial8250_set_IER(p, p->ier); } } EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); @@ -1710,7 +1710,7 @@ static void serial8250_disable_ms(struct uart_port *port) mctrl_gpio_disable_ms(up->gpios); up->ier &= ~UART_IER_MSI; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); } static void serial8250_enable_ms(struct uart_port *port) @@ -1726,7 +1726,7 @@ static void serial8250_enable_ms(struct uart_port *port) up->ier |= UART_IER_MSI; serial8250_rpm_get(up); - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); serial8250_rpm_put(up); } @@ -2145,14 +2145,7 @@ static void serial8250_put_poll_char(struct uart_port *port, struct uart_8250_port *up = up_to_u8250p(port); serial8250_rpm_get(up); - /* - * First save the IER then disable the interrupts - */ - ier = serial_port_in(port, UART_IER); - if (up->capabilities & UART_CAP_UUE) - serial_port_out(port, UART_IER, UART_IER_UUE); - else - serial_port_out(port, UART_IER, 0); + ier = serial8250_clear_IER(up); wait_for_xmitr(up, BOTH_EMPTY); /* @@ -2165,7 +2158,7 @@ static void serial8250_put_poll_char(struct uart_port *port, * and restore the IER */ wait_for_xmitr(up, BOTH_EMPTY); - serial_port_out(port, UART_IER, ier); + serial8250_set_IER(up, ier); serial8250_rpm_put(up); } @@ -2468,7 +2461,7 @@ void serial8250_do_shutdown(struct uart_port *port) */ spin_lock_irqsave(&port->lock, flags); up->ier = 0; - serial_port_out(port, UART_IER, 0); + serial8250_set_IER(up, 0); spin_unlock_irqrestore(&port->lock, flags); synchronize_irq(port->irq); @@ -2850,7 +2843,7 @@ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, if (up->capabilities & UART_CAP_RTOIE) up->ier |= UART_IER_RTOIE; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); if (up->capabilities & UART_CAP_EFR) { unsigned char efr = 0; @@ -3316,7 +3309,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_defaults); #ifdef CONFIG_SERIAL_8250_CONSOLE -static void serial8250_console_putchar(struct uart_port *port, int ch) +static void serial8250_console_putchar_locked(struct uart_port *port, int ch) { struct uart_8250_port *up = up_to_u8250p(port); @@ -3324,6 +3317,18 @@ static void serial8250_console_putchar(struct uart_port *port, int ch) serial_port_out(port, UART_TX, ch); } +static void serial8250_console_putchar(struct uart_port *port, int ch) +{ + struct uart_8250_port *up = up_to_u8250p(port); + unsigned long flags; + + wait_for_xmitr(up, UART_LSR_THRE); + + console_atomic_lock(flags); + serial8250_console_putchar_locked(port, ch); + console_atomic_unlock(flags); +} + /* * Restore serial console when h/w power-off detected */ @@ -3345,6 +3350,32 @@ static void serial8250_console_restore(struct uart_8250_port *up) serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); } +void serial8250_console_write_atomic(struct uart_8250_port *up, + const char *s, unsigned int count) +{ + struct uart_port *port = 
&up->port; + unsigned long flags; + unsigned int ier; + + console_atomic_lock(flags); + + touch_nmi_watchdog(); + + ier = serial8250_clear_IER(up); + + if (atomic_fetch_inc(&up->console_printing)) { + uart_console_write(port, "\n", 1, + serial8250_console_putchar_locked); + } + uart_console_write(port, s, count, serial8250_console_putchar_locked); + atomic_dec(&up->console_printing); + + wait_for_xmitr(up, BOTH_EMPTY); + serial8250_set_IER(up, ier); + + console_atomic_unlock(flags); +} + /* * Print a string to the serial port trying not to disturb * any possible real use of the port... @@ -3361,24 +3392,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, struct uart_port *port = &up->port; unsigned long flags; unsigned int ier; - int locked = 1; touch_nmi_watchdog(); - if (oops_in_progress) - locked = spin_trylock_irqsave(&port->lock, flags); - else - spin_lock_irqsave(&port->lock, flags); - - /* - * First save the IER then disable the interrupts - */ - ier = serial_port_in(port, UART_IER); + spin_lock_irqsave(&port->lock, flags); - if (up->capabilities & UART_CAP_UUE) - serial_port_out(port, UART_IER, UART_IER_UUE); - else - serial_port_out(port, UART_IER, 0); + ier = serial8250_clear_IER(up); /* check scratch reg to see if port powered off during system sleep */ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { @@ -3392,7 +3411,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, mdelay(port->rs485.delay_rts_before_send); } + atomic_inc(&up->console_printing); uart_console_write(port, s, count, serial8250_console_putchar); + atomic_dec(&up->console_printing); /* * Finally, wait for transmitter to become empty @@ -3405,8 +3426,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, if (em485->tx_stopped) up->rs485_stop_tx(up); } - - serial_port_out(port, UART_IER, ier); + serial8250_set_IER(up, ier); /* * The receive handling will happen properly because the @@ -3418,8 +3438,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, if (up->msr_saved_flags) serial8250_modem_status(up); - if (locked) - spin_unlock_irqrestore(&port->lock, flags); + spin_unlock_irqrestore(&port->lock, flags); } static unsigned int probe_baud(struct uart_port *port) @@ -3439,6 +3458,7 @@ static unsigned int probe_baud(struct uart_port *port) int serial8250_console_setup(struct uart_port *port, char *options, bool probe) { + struct uart_8250_port *up = up_to_u8250p(port); int baud = 9600; int bits = 8; int parity = 'n'; @@ -3448,6 +3468,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) if (!port->iobase && !port->membase) return -ENODEV; + atomic_set(&up->console_printing, 0); + if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else if (probe) diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 3d40306971b8..aa02ac32f4bc 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -2322,18 +2322,24 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) { struct uart_amba_port *uap = amba_ports[co->index]; unsigned int old_cr = 0, new_cr; - unsigned long flags; + unsigned long flags = 0; int locked = 1; clk_enable(uap->clk); - local_irq_save(flags); + /* + * local_irq_save(flags); + * + * This local_irq_save() is nonsense. If we come in via sysrq + * handling then interrupts are already disabled. Aside of + * that the port.sysrq check is racy on SMP regardless. 
+	 */
 	if (uap->port.sysrq)
 		locked = 0;
 	else if (oops_in_progress)
-		locked = spin_trylock(&uap->port.lock);
+		locked = spin_trylock_irqsave(&uap->port.lock, flags);
 	else
-		spin_lock(&uap->port.lock);
+		spin_lock_irqsave(&uap->port.lock, flags);
 
 	/*
 	 * First save the CR then disable the interrupts
@@ -2359,8 +2365,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
 		pl011_write(old_cr, uap, REG_CR);
 
 	if (locked)
-		spin_unlock(&uap->port.lock);
-	local_irq_restore(flags);
+		spin_unlock_irqrestore(&uap->port.lock, flags);
 
 	clk_disable(uap->clk);
 }
diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
index 0862941862c8..10970632f0e4 100644
--- a/drivers/tty/serial/omap-serial.c
+++ b/drivers/tty/serial/omap-serial.c
@@ -1255,13 +1255,10 @@ serial_omap_console_write(struct console *co, const char *s,
 	unsigned int ier;
 	int locked = 1;
 
-	local_irq_save(flags);
-	if (up->port.sysrq)
-		locked = 0;
-	else if (oops_in_progress)
-		locked = spin_trylock(&up->port.lock);
+	if (up->port.sysrq || oops_in_progress)
+		locked = spin_trylock_irqsave(&up->port.lock, flags);
 	else
-		spin_lock(&up->port.lock);
+		spin_lock_irqsave(&up->port.lock, flags);
 
 	/*
 	 * First save the IER then disable the interrupts
@@ -1288,8 +1285,7 @@ serial_omap_console_write(struct console *co, const char *s,
 	check_modem_status(up);
 
 	if (locked)
-		spin_unlock(&up->port.lock);
-	local_irq_restore(flags);
+		spin_unlock_irqrestore(&up->port.lock, flags);
 }
 
 static int __init
diff --git a/drivers/virt/acrn/irqfd.c b/drivers/virt/acrn/irqfd.c
index df5184979b28..d4ad211dce7a 100644
--- a/drivers/virt/acrn/irqfd.c
+++ b/drivers/virt/acrn/irqfd.c
@@ -17,7 +17,6 @@
 #include "acrn_drv.h"
 
 static LIST_HEAD(acrn_irqfd_clients);
-static DEFINE_MUTEX(acrn_irqfds_mutex);
 
 /**
  * struct hsm_irqfd - Properties of HSM irqfd
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 45cfd50a9521..502b56597f10 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -239,7 +239,7 @@ int afs_silly_iput(struct dentry *dentry, struct inode *inode)
 	struct dentry *alias;
 	int ret;
 
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+	DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
 
 	_enter("%p{%pd},%llx", dentry, dentry, vnode->fid.vnode);
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 1929e80c09ee..48eb8c30c6db 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -69,7 +69,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
 	struct inode *inode;
 	struct super_block *sb = parent->d_sb;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+	DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
 
 	cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
diff --git a/fs/dcache.c b/fs/dcache.c
index cf871a81f4fd..02db80f2817f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2537,7 +2537,13 @@ EXPORT_SYMBOL(d_rehash);
 
 static inline unsigned start_dir_add(struct inode *dir)
 {
-
+	/*
+	 * The caller has a spinlock_t (dentry::d_lock) acquired which disables
+	 * preemption on !PREEMPT_RT. On PREEMPT_RT the lock does not disable
+	 * preemption and it has to be done explicitly.
+ */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) @@ -2549,25 +2555,30 @@ static inline unsigned start_dir_add(struct inode *dir) static inline void end_dir_add(struct inode *dir, unsigned n) { smp_store_release(&dir->i_dir_seq, n + 2); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } static void d_wait_lookup(struct dentry *dentry) { - if (d_in_lookup(dentry)) { - DECLARE_WAITQUEUE(wait, current); - add_wait_queue(dentry->d_wait, &wait); - do { - set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock(&dentry->d_lock); - schedule(); - spin_lock(&dentry->d_lock); - } while (d_in_lookup(dentry)); - } + struct swait_queue __wait; + + if (!d_in_lookup(dentry)) + return; + + INIT_LIST_HEAD(&__wait.task_list); + do { + prepare_to_swait_exclusive(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&dentry->d_lock); + schedule(); + spin_lock(&dentry->d_lock); + } while (d_in_lookup(dentry)); + finish_swait(dentry->d_wait, &__wait); } struct dentry *d_alloc_parallel(struct dentry *parent, const struct qstr *name, - wait_queue_head_t *wq) + struct swait_queue_head *wq) { unsigned int hash = name->hash; struct hlist_bl_head *b = in_lookup_hash(parent, hash); @@ -2682,7 +2693,7 @@ void __d_lookup_done(struct dentry *dentry) hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); - wake_up_all(dentry->d_wait); + swake_up_all(dentry->d_wait); dentry->d_wait = NULL; hlist_bl_unlock(b); INIT_HLIST_NODE(&dentry->d_u.d_alias); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index c3e4804b8fcb..9edb87e11680 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -81,7 +81,6 @@ extern unsigned fscache_debug; extern struct kobject *fscache_root; extern struct workqueue_struct *fscache_object_wq; extern struct workqueue_struct *fscache_op_wq; -DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n); diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 4207f98e405f..85f8cf3a323d 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -41,8 +41,6 @@ struct kobject *fscache_root; struct workqueue_struct *fscache_object_wq; struct workqueue_struct *fscache_op_wq; -DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); - /* these values serve as lower bounds, will be adjusted in fscache_init() */ static unsigned fscache_object_max_active = 4; static unsigned fscache_op_max_active = 2; @@ -138,7 +136,6 @@ unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n) static int __init fscache_init(void) { unsigned int nr_cpus = num_possible_cpus(); - unsigned int cpu; int ret; fscache_object_max_active = @@ -161,9 +158,6 @@ static int __init fscache_init(void) if (!fscache_op_wq) goto error_op_wq; - for_each_possible_cpu(cpu) - init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); - ret = fscache_proc_init(); if (ret < 0) goto error_proc; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 6a675652129b..7a972d144b54 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -798,6 +798,8 @@ void fscache_object_destroy(struct fscache_object *object) } EXPORT_SYMBOL(fscache_object_destroy); +static DECLARE_WAIT_QUEUE_HEAD(fscache_object_cong_wait); + /* * enqueue an object for metadata-type processing */ @@ -806,16 +808,12 @@ void fscache_enqueue_object(struct fscache_object 
*object)
 	_enter("{OBJ%x}", object->debug_id);
 
 	if (fscache_get_object(object, fscache_obj_get_queue) >= 0) {
-		wait_queue_head_t *cong_wq =
-			&get_cpu_var(fscache_object_cong_wait);
 
 		if (queue_work(fscache_object_wq, &object->work)) {
 			if (fscache_object_congested())
-				wake_up(cong_wq);
+				wake_up(&fscache_object_cong_wait);
 		} else
 			fscache_put_object(object, fscache_obj_put_queue);
-
-		put_cpu_var(fscache_object_cong_wait);
 	}
 }
@@ -833,16 +831,15 @@ void fscache_enqueue_object(struct fscache_object *object)
  */
 bool fscache_object_sleep_till_congested(signed long *timeoutp)
 {
-	wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait);
 	DEFINE_WAIT(wait);
 
 	if (fscache_object_congested())
 		return true;
 
-	add_wait_queue_exclusive(cong_wq, &wait);
+	add_wait_queue_exclusive(&fscache_object_cong_wait, &wait);
 	if (!fscache_object_congested())
 		*timeoutp = schedule_timeout(*timeoutp);
-	finish_wait(cong_wq, &wait);
+	finish_wait(&fscache_object_cong_wait, &wait);
 
 	return fscache_object_congested();
 }
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index bc267832310c..3176913fae6c 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -158,7 +158,7 @@ static int fuse_direntplus_link(struct file *file,
 	struct inode *dir = d_inode(parent);
 	struct fuse_conn *fc;
 	struct inode *inode;
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+	DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
 
 	if (!o->nodeid) {
 		/*
diff --git a/fs/namei.c b/fs/namei.c
index 2ea15d043412..383f9fd2daaa 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1633,7 +1633,7 @@ static struct dentry *__lookup_slow(const struct qstr *name,
 {
 	struct dentry *dentry, *old;
 	struct inode *inode = dir->d_inode;
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+	DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
 
 	/* Don't go there if it's already dead */
 	if (unlikely(IS_DEADDIR(inode)))
@@ -3244,7 +3244,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 	struct dentry *dentry;
 	int error, create_error = 0;
 	umode_t mode = op->mode;
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+	DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
 
 	if (unlikely(IS_DEADDIR(dir_inode)))
 		return ERR_PTR(-ENOENT);
diff --git a/fs/namespace.c b/fs/namespace.c
index b696543adab8..4799232935ee 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -343,8 +343,24 @@ int __mnt_want_write(struct vfsmount *m)
 	 * incremented count after it has set MNT_WRITE_HOLD.
 	 */
 	smp_mb();
-	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
-		cpu_relax();
+	might_lock(&mount_lock.lock);
+	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
+		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+			cpu_relax();
+		} else {
+			/*
+			 * This prevents priority inversion, if the task
+			 * setting MNT_WRITE_HOLD got preempted on a remote
+			 * CPU, and it prevents live lock if the task setting
+			 * MNT_WRITE_HOLD has a lower priority and is bound to
+			 * the same CPU as the task that is spinning here.
+			 */
+			preempt_enable();
+			lock_mount_hash();
+			unlock_mount_hash();
+			preempt_disable();
+		}
+	}
 	/*
 	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
 	 * be set to match its requirements.
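
With the dcache conversion above, every d_alloc_parallel() caller passes a simple waitqueue head instead of a regular one; the same mechanical change repeats in afs, cifs, fuse, namei and, below, nfs and proc. The filesystem-side pattern reduces to the following sketch for a hypothetical lookup path:

.. code-block:: c

    #include <linux/dcache.h>
    #include <linux/err.h>
    #include <linux/swait.h>

    static struct dentry *example_lookup_parallel(struct dentry *parent,
                                                  const struct qstr *name)
    {
            DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
            struct dentry *dentry;

            dentry = d_alloc_parallel(parent, name, &wq);
            if (IS_ERR(dentry) || !d_in_lookup(dentry))
                    return dentry;  /* error, or raced with another lookup */

            /* ... look up the inode and splice it in ... */

            d_lookup_done(dentry);  /* wakes waiters via swake_up_all() */
            return dentry;
    }
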
So we must not load that until diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 78219396788b..06bde5728e2f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -636,7 +636,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, unsigned long dir_verifier) { struct qstr filename = QSTR_INIT(entry->name, entry->len); - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); struct dentry *dentry; struct dentry *alias; struct inode *inode; @@ -1867,7 +1867,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned open_flags, umode_t mode) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); struct nfs_open_context *ctx; struct dentry *res; struct iattr attr = { .ia_valid = ATTR_OPEN }; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index d5ccf095b2a7..0944c068f5cb 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include @@ -184,7 +184,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name) data->cred = get_current_cred(); data->res.dir_attr = &data->dir_attr; - init_waitqueue_head(&data->wq); + init_swait_queue_head(&data->wq); status = -EBUSY; spin_lock(&dentry->d_lock); diff --git a/fs/proc/base.c b/fs/proc/base.c index 1f394095eb88..fade2c7c705b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -96,6 +96,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -2043,7 +2044,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, child = d_hash_and_lookup(dir, &qname); if (!child) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) goto end_instantiate; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5d66faecd4ef..619d8e114646 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -678,7 +678,7 @@ static bool proc_sys_fill_cache(struct file *file, child = d_lookup(dir, &qname); if (!child) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) return false; diff --git a/include/asm-generic/softirq_stack.h b/include/asm-generic/softirq_stack.h index eceeecf6a5bd..d3e2d81656e0 100644 --- a/include/asm-generic/softirq_stack.h +++ b/include/asm-generic/softirq_stack.h @@ -2,7 +2,7 @@ #ifndef __ASM_GENERIC_SOFTIRQ_STACK_H #define __ASM_GENERIC_SOFTIRQ_STACK_H -#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK +#if defined(CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK) && !defined(CONFIG_PREEMPT_RT) void do_softirq_own_stack(void); #else static inline void do_softirq_own_stack(void) diff --git a/include/linux/console.h b/include/linux/console.h index a97f277cfdfa..487a4266ab2c 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -16,6 +16,13 @@ #include #include +#include +#include + +struct latched_seq { + seqcount_latch_t latch; + u64 val[2]; +}; struct vc_data; struct console_font_op; @@ -136,10 +143,12 @@ static inline int con_debug_leave(void) #define CON_ANYTIME (16) /* Safe to call when cpu is offline */ #define CON_BRL (32) /* Used for a braille device */ #define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */ +#define CON_HANDOVER (128) /* Device was previously a boot console. 
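
struct latched_seq above pairs a seqcount_latch_t with two value slots so that console sequence numbers can be read locklessly, even against writers running in NMI context. The struct only makes sense together with the usual latch read loop; a plausible reader, assuming the standard raw_read_seqcount_latch()/read_seqcount_latch_retry() pairing from <linux/seqlock.h>:

.. code-block:: c

    #include <linux/seqlock.h>

    static u64 latched_seq_read_nolock(struct latched_seq *ls)
    {
            unsigned int seq;
            u64 val;

            do {
                    seq = raw_read_seqcount_latch(&ls->latch);
                    val = ls->val[seq & 0x1];
            } while (read_seqcount_latch_retry(&ls->latch, seq));

            return val;
    }
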
*/ struct console { char name[16]; void (*write)(struct console *, const char *, unsigned); + void (*write_atomic)(struct console *co, const char *s, unsigned int count); int (*read)(struct console *, char *, unsigned); struct tty_driver *(*device)(struct console *, int *); void (*unblank)(void); @@ -149,6 +158,16 @@ struct console { short flags; short index; int cflag; +#ifdef CONFIG_PRINTK + char sync_buf[CONSOLE_LOG_MAX]; + struct latched_seq printk_seq; + struct latched_seq printk_sync_seq; +#ifdef CONFIG_HAVE_NMI + struct latched_seq printk_sync_nmi_seq; +#endif +#endif /* CONFIG_PRINTK */ + + struct task_struct *thread; uint ispeed; uint ospeed; void *data; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 9e23d33bb6f1..9f89d4887e35 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -108,7 +108,7 @@ struct dentry { union { struct list_head d_lru; /* LRU list */ - wait_queue_head_t *d_wait; /* in-lookup ones only */ + struct swait_queue_head *d_wait; /* in-lookup ones only */ }; struct list_head d_child; /* child of parent list */ struct list_head d_subdirs; /* our children */ @@ -240,7 +240,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_anon(struct super_block *); extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, - wait_queue_head_t *); + struct swait_queue_head *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern struct dentry * d_exact_alias(struct dentry *, struct inode *); diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 2e2b8d6140ed..71064a2c2caf 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -57,9 +57,15 @@ # define ARCH_EXIT_TO_USER_MODE_WORK (0) #endif +#ifdef CONFIG_PREEMPT_LAZY +# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) +#else +# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED) +#endif + #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + _TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ARCH_EXIT_TO_USER_MODE_WORK) /** diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index ec2a47a81e42..8cd11a223260 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -3,6 +3,7 @@ #define _LINUX_IRQ_WORK_H #include +#include /* * An entry can be in one of four states: @@ -16,11 +17,13 @@ struct irq_work { struct __call_single_node node; void (*func)(struct irq_work *); + struct rcuwait irqwait; }; #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ .node = { .u_flags = (_flags), }, \ .func = (_func), \ + .irqwait = __RCUWAIT_INITIALIZER(irqwait), \ } #define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) @@ -46,6 +49,11 @@ static inline bool irq_work_is_busy(struct irq_work *work) return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY; } +static inline bool irq_work_is_hard(struct irq_work *work) +{ + return atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ; +} + bool irq_work_queue(struct irq_work *work); bool irq_work_queue_on(struct irq_work *work, int cpu); diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 59aea39785bf..d69b819b53e0 100644 --- a/include/linux/irqdesc.h +++ 
b/include/linux/irqdesc.h @@ -160,6 +160,7 @@ static inline void generic_handle_irq_desc(struct irq_desc *desc) int handle_irq_desc(struct irq_desc *desc); int generic_handle_irq(unsigned int irq); +int generic_handle_irq_safe(unsigned int irq); #ifdef CONFIG_IRQ_DOMAIN /* diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 600c10da321a..4b140938b03e 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -71,14 +71,6 @@ do { \ do { \ __this_cpu_dec(hardirq_context); \ } while (0) -# define lockdep_softirq_enter() \ -do { \ - current->softirq_context++; \ -} while (0) -# define lockdep_softirq_exit() \ -do { \ - current->softirq_context--; \ -} while (0) # define lockdep_hrtimer_enter(__hrtimer) \ ({ \ @@ -140,6 +132,21 @@ do { \ # define lockdep_irq_work_exit(__work) do { } while (0) #endif +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT) +# define lockdep_softirq_enter() \ +do { \ + current->softirq_context++; \ +} while (0) +# define lockdep_softirq_exit() \ +do { \ + current->softirq_context--; \ +} while (0) + +#else +# define lockdep_softirq_enter() do { } while (0) +# define lockdep_softirq_exit() do { } while (0) +#endif + #if defined(CONFIG_IRQSOFF_TRACER) || \ defined(CONFIG_PREEMPT_TRACER) extern void stop_critical_timings(void); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f56cd8879a59..49f1e924b6e6 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -111,8 +111,8 @@ static __always_inline void might_resched(void) #endif /* CONFIG_PREEMPT_* */ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -extern void ___might_sleep(const char *file, int line, int preempt_offset); -extern void __might_sleep(const char *file, int line, int preempt_offset); +extern void __might_resched(const char *file, int line, unsigned int offsets); +extern void __might_sleep(const char *file, int line); extern void __cant_sleep(const char *file, int line, int preempt_offset); extern void __cant_migrate(const char *file, int line); @@ -129,7 +129,7 @@ extern void __cant_migrate(const char *file, int line); * supposed to. */ # define might_sleep() \ - do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) + do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) /** * cant_sleep - annotation for functions that cannot sleep * @@ -168,10 +168,9 @@ extern void __cant_migrate(const char *file, int line); */ # define non_block_end() WARN_ON(current->non_block_count-- == 0) #else - static inline void ___might_sleep(const char *file, int line, - int preempt_offset) { } - static inline void __might_sleep(const char *file, int line, - int preempt_offset) { } + static inline void __might_resched(const char *file, int line, + unsigned int offsets) { } +static inline void __might_sleep(const char *file, int line) { } # define might_sleep() do { might_resched(); } while (0) # define cant_sleep() do { } while (0) # define cant_migrate() do { } while (0) diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 258cdde8d356..9bca0d98db5a 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -212,6 +212,8 @@ extern void kgdb_call_nmi_hook(void *ignored); */ extern void kgdb_roundup_cpus(void); +extern void kgdb_roundup_cpu(unsigned int cpu); + /** * kgdb_arch_set_pc - Generic call back to the program counter * @regs: Current &struct pt_regs. 
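
The driver conversions earlier in this series (i2c-cht-wc, ezx-pcap, hi6421v600, lan78xx, greybus) all rely on the generic_handle_irq_safe() declaration above. Unlike generic_handle_irq(), it may be called from any context because it saves and disables local interrupts internally. A sketch of a threaded demultiplexing handler using it, with hypothetical driver names:

.. code-block:: c

    #include <linux/device.h>
    #include <linux/interrupt.h>
    #include <linux/irqdesc.h>

    struct example_chip {                   /* hypothetical demux driver */
            struct device *dev;
            int child_irq;
    };

    static irqreturn_t example_thread_fn(int irq, void *data)
    {
            struct example_chip *chip = data;
            int ret;

            /* No local_irq_disable()/local_irq_enable() dance needed. */
            ret = generic_handle_irq_safe(chip->child_irq);
            if (ret)
                    dev_err(chip->dev, "failed to invoke irq handler\n");

            return IRQ_HANDLED;
    }
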
@@ -365,5 +367,6 @@ extern void kgdb_free_init_mem(void); #define dbg_late_init() static inline void kgdb_panic(const char *msg) {} static inline void kgdb_free_init_mem(void) { } +static inline void kgdb_roundup_cpu(unsigned int cpu) {} #endif /* ! CONFIG_KGDB */ #endif /* _KGDB_H_ */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f8ee09c711f..e9672de22cf2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -572,6 +573,9 @@ struct mm_struct { bool tlb_flush_batched; #endif struct uprobes_state uprobes_state; +#ifdef CONFIG_PREEMPT_RT + struct rcu_head delayed_drop; +#endif #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 39f1893ecac0..014209b07b3a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1916,7 +1916,6 @@ enum netdev_ml_priv_type { * @sfp_bus: attached &struct sfp_bus structure. * * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock - * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount * * @proto_down: protocol port state information can be sent to the * switch driver and used to set the phys state of the @@ -2250,7 +2249,6 @@ struct net_device { struct phy_device *phydev; struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; - struct lock_class_key *qdisc_running_key; bool proto_down; unsigned wol_enabled:1; unsigned threaded:1; @@ -2360,13 +2358,11 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, #define netdev_lockdep_set_classes(dev) \ { \ static struct lock_class_key qdisc_tx_busylock_key; \ - static struct lock_class_key qdisc_running_key; \ static struct lock_class_key qdisc_xmit_lock_key; \ static struct lock_class_key dev_addr_list_lock_key; \ unsigned int i; \ \ (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ - (dev)->qdisc_running_key = &qdisc_running_key; \ lockdep_set_class(&(dev)->addr_list_lock, \ &dev_addr_list_lock_key); \ for (i = 0; i < (dev)->num_tx_queues; i++) \ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ecd74cc34797..6af28750625a 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1692,7 +1692,7 @@ struct nfs_unlinkdata { struct nfs_removeargs args; struct nfs_removeres res; struct dentry *dentry; - wait_queue_head_t wq; + struct swait_queue_head wq; const struct cred *cred; struct nfs_fattr dir_attr; long timeout; diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 4d244e295e85..3da73c968211 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -122,9 +122,10 @@ * The preempt_count offset after spin_lock() */ #if !defined(CONFIG_PREEMPT_RT) -#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET +#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET #else -#define PREEMPT_LOCK_OFFSET 0 +/* Locks on RT do not disable preemption */ +#define PREEMPT_LOCK_OFFSET 0 #endif /* @@ -174,6 +175,20 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) +#ifdef CONFIG_PREEMPT_LAZY +#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0) +#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0) +#define inc_preempt_lazy_count() add_preempt_lazy_count(1) +#define dec_preempt_lazy_count() sub_preempt_lazy_count(1) +#define preempt_lazy_count() 
(current_thread_info()->preempt_lazy_count) +#else +#define add_preempt_lazy_count(val) do { } while (0) +#define sub_preempt_lazy_count(val) do { } while (0) +#define inc_preempt_lazy_count() do { } while (0) +#define dec_preempt_lazy_count() do { } while (0) +#define preempt_lazy_count() (0) +#endif + #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ @@ -182,13 +197,25 @@ do { \ barrier(); \ } while (0) +#define preempt_lazy_disable() \ +do { \ + inc_preempt_lazy_count(); \ + barrier(); \ +} while (0) + #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) -#define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#ifndef CONFIG_PREEMPT_RT +# define preempt_enable_no_resched() sched_preempt_enable_no_resched() +# define preempt_check_resched_rt() barrier(); +#else +# define preempt_enable_no_resched() preempt_enable() +# define preempt_check_resched_rt() preempt_check_resched() +#endif #define preemptible() (preempt_count() == 0 && !irqs_disabled()) @@ -213,6 +240,18 @@ do { \ __preempt_schedule(); \ } while (0) +/* + * open code preempt_check_resched() because it is not exported to modules and + * used by local_unlock() or bpf_enable_instrumentation(). + */ +#define preempt_lazy_enable() \ +do { \ + dec_preempt_lazy_count(); \ + barrier(); \ + if (should_resched(0)) \ + __preempt_schedule(); \ +} while (0) + #else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ @@ -220,6 +259,12 @@ do { \ preempt_count_dec(); \ } while (0) +#define preempt_lazy_enable() \ +do { \ + dec_preempt_lazy_count(); \ + barrier(); \ +} while (0) + #define preempt_enable_notrace() \ do { \ barrier(); \ @@ -258,8 +303,12 @@ do { \ #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() +#define preempt_check_resched_rt() barrier() #define preemptible() 0 +#define preempt_lazy_disable() barrier() +#define preempt_lazy_enable() barrier() + #endif /* CONFIG_PREEMPT_COUNT */ #ifdef MODULE @@ -278,7 +327,7 @@ do { \ } while (0) #define preempt_fold_need_resched() \ do { \ - if (tif_need_resched()) \ + if (tif_need_resched_now()) \ set_preempt_need_resched(); \ } while (0) @@ -394,8 +443,15 @@ extern void migrate_enable(void); #else -static inline void migrate_disable(void) { } -static inline void migrate_enable(void) { } +static inline void migrate_disable(void) +{ + preempt_lazy_disable(); +} + +static inline void migrate_enable(void) +{ + preempt_lazy_enable(); +} #endif /* CONFIG_SMP */ diff --git a/include/linux/printk.h b/include/linux/printk.h index 9497f6b98339..f1b9cd8d11d6 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -47,6 +47,12 @@ static inline const char *printk_skip_headers(const char *buffer) #define CONSOLE_EXT_LOG_MAX 8192 +/* + * The maximum size of a record formatted for console printing + * (i.e. with the prefix prepended to every line). + */ +#define CONSOLE_LOG_MAX 1024 + /* printk's without a loglevel use this.. */ #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT @@ -155,20 +161,7 @@ int vprintk(const char *fmt, va_list args); asmlinkage __printf(1, 2) __cold int _printk(const char *fmt, ...); -/* - * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! 
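
The lazy-preemption macros above defer preemption of SCHED_OTHER tasks without affecting RT tasks: TIF_NEED_RESCHED still preempts immediately, while TIF_NEED_RESCHED_LAZY is held off until the lazy count drops to zero. A minimal sketch of a section protected this way (hypothetical usage; on kernels without CONFIG_PREEMPT_LAZY the calls compile down to barriers):

.. code-block:: c

    #include <linux/preempt.h>

    static void example_lazy_section(void)
    {
            preempt_lazy_disable();
            /*
             * A SCHED_OTHER wakeup sets TIF_NEED_RESCHED_LAZY and waits;
             * an RT wakeup sets TIF_NEED_RESCHED and preempts right here.
             */
            barrier();
            preempt_lazy_enable();  /* folds in a deferred reschedule */
    }
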
- */ -__printf(1, 2) __cold int _printk_deferred(const char *fmt, ...); - -extern void __printk_safe_enter(void); -extern void __printk_safe_exit(void); -/* - * The printk_deferred_enter/exit macros are available only as a hack for - * some code paths that need to defer all printk console printing. Interrupts - * must be disabled for the deferred duration. - */ -#define printk_deferred_enter __printk_safe_enter -#define printk_deferred_exit __printk_safe_exit +bool pr_flush(int timeout_ms, bool reset_on_progress); /* * Please don't use printk_ratelimit(), because it shares ratelimiting state @@ -210,18 +203,10 @@ int _printk(const char *s, ...) { return 0; } -static inline __printf(1, 2) __cold -int _printk_deferred(const char *s, ...) -{ - return 0; -} - -static inline void printk_deferred_enter(void) -{ -} -static inline void printk_deferred_exit(void) +static inline bool pr_flush(int timeout_ms, bool reset_on_progress) { + return true; } static inline int printk_ratelimit(void) @@ -284,17 +269,30 @@ static inline void printk_trigger_flush(void) extern int __printk_cpu_trylock(void); extern void __printk_wait_on_cpu_lock(void); extern void __printk_cpu_unlock(void); +extern bool kgdb_roundup_delay(unsigned int cpu); + +#else + +#define __printk_cpu_trylock() 1 +#define __printk_wait_on_cpu_lock() +#define __printk_cpu_unlock() + +static inline bool kgdb_roundup_delay(unsigned int cpu) +{ + return false; +} +#endif /* CONFIG_SMP */ /** - * printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning - * lock and disable interrupts. + * raw_printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning + * lock and disable interrupts. * @flags: Stack-allocated storage for saving local interrupt state, - * to be passed to printk_cpu_unlock_irqrestore(). + * to be passed to raw_printk_cpu_unlock_irqrestore(). * * If the lock is owned by another CPU, spin until it becomes available. * Interrupts are restored while spinning. */ -#define printk_cpu_lock_irqsave(flags) \ +#define raw_printk_cpu_lock_irqsave(flags) \ for (;;) { \ local_irq_save(flags); \ if (__printk_cpu_trylock()) \ @@ -304,22 +302,30 @@ extern void __printk_cpu_unlock(void); } /** - * printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant spinning - * lock and restore interrupts. - * @flags: Caller's saved interrupt state, from printk_cpu_lock_irqsave(). + * raw_printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant + * spinning lock and restore interrupts. + * @flags: Caller's saved interrupt state from raw_printk_cpu_lock_irqsave(). */ -#define printk_cpu_unlock_irqrestore(flags) \ +#define raw_printk_cpu_unlock_irqrestore(flags) \ do { \ __printk_cpu_unlock(); \ local_irq_restore(flags); \ - } while (0) \ - -#else + } while (0) -#define printk_cpu_lock_irqsave(flags) ((void)flags) -#define printk_cpu_unlock_irqrestore(flags) ((void)flags) +/* + * Used to synchronize atomic consoles. + * + * The same as raw_printk_cpu_lock_irqsave() except that hardware interrupts + * are _not_ restored while spinning. + */ +#define console_atomic_lock(flags) \ + do { \ + local_irq_save(flags); \ + while (!__printk_cpu_trylock()) \ + cpu_relax(); \ + } while (0) -#endif /* CONFIG_SMP */ +#define console_atomic_unlock raw_printk_cpu_unlock_irqrestore extern int kptr_restrict; @@ -448,8 +454,6 @@ struct pi_entry { * See the vsnprintf() documentation for format string extensions over C99. */ #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__) -#define printk_deferred(fmt, ...) 
\ - printk_index_wrap(_printk_deferred, fmt, ##__VA_ARGS__) /** * pr_emerg - Print an emergency-level message @@ -587,13 +591,9 @@ struct pi_entry { #ifdef CONFIG_PRINTK #define printk_once(fmt, ...) \ DO_ONCE_LITE(printk, fmt, ##__VA_ARGS__) -#define printk_deferred_once(fmt, ...) \ - DO_ONCE_LITE(printk_deferred, fmt, ##__VA_ARGS__) #else #define printk_once(fmt, ...) \ no_printk(fmt, ##__VA_ARGS__) -#define printk_deferred_once(fmt, ...) \ - no_printk(fmt, ##__VA_ARGS__) #endif #define pr_emerg_once(fmt, ...) \ diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h index b676aa419eef..c21c7f8103e2 100644 --- a/include/linux/ratelimit_types.h +++ b/include/linux/ratelimit_types.h @@ -4,7 +4,7 @@ #include #include -#include +#include #define DEFAULT_RATELIMIT_INTERVAL (5 * HZ) #define DEFAULT_RATELIMIT_BURST 10 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 434d12fe2d4f..de6d1a21f113 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -94,6 +94,13 @@ void rcu_init_tasks_generic(void); static inline void rcu_init_tasks_generic(void) { } #endif +#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TASKS_RCU_GENERIC) +void rcu_tasks_initiate_self_tests(void); +#else +static inline void rcu_tasks_initiate_self_tests(void) {} +#endif + + #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 9deedfeec2b1..7d049883a08a 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -99,13 +99,22 @@ extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass); +extern void _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock); #define rt_mutex_lock(lock) rt_mutex_lock_nested(lock, 0) +#define rt_mutex_lock_nest_lock(lock, nest_lock) \ + do { \ + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ + _rt_mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ + } while (0) + #else extern void rt_mutex_lock(struct rt_mutex *lock); #define rt_mutex_lock_nested(lock, subclass) rt_mutex_lock(lock) +#define rt_mutex_lock_nest_lock(lock, nest_lock) rt_mutex_lock(lock) #endif extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); +extern int rt_mutex_lock_killable(struct rt_mutex *lock); extern int rt_mutex_trylock(struct rt_mutex *lock); extern void rt_mutex_unlock(struct rt_mutex *lock); diff --git a/include/linux/sched.h b/include/linux/sched.h index ad7ff332a0ac..20efdf15c2b9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -118,12 +118,8 @@ struct task_group; #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) -#define task_is_traced(task) ((READ_ONCE(task->__state) & __TASK_TRACED) != 0) - #define task_is_stopped(task) ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0) -#define task_is_stopped_or_traced(task) ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0) - /* * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). @@ -1084,6 +1080,10 @@ struct task_struct { /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; +#ifdef CONFIG_PREEMPT_RT + /* TODO: move me into ->restart_block ? 
*/ + struct kernel_siginfo forced_info; +#endif unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; @@ -1738,6 +1738,16 @@ static __always_inline bool is_percpu_thread(void) #endif } +/* Is the current task guaranteed to stay on its current CPU? */ +static inline bool is_migratable(void) +{ +#ifdef CONFIG_SMP + return preemptible() && !current->migration_disabled; +#else + return false; +#endif +} + /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ @@ -2013,6 +2023,118 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } +#ifdef CONFIG_PREEMPT_LAZY +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk) +{ + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY)); +} + +static inline int need_resched_lazy(void) +{ + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +} + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +#else +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { } +static inline int need_resched_lazy(void) { return 0; } + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +#endif + +#ifdef CONFIG_PREEMPT_RT +static inline bool task_match_saved_state(struct task_struct *p, long match_state) +{ + return p->saved_state == match_state; +} + +static inline bool task_is_traced(struct task_struct *task) +{ + bool traced = false; + + /* in case the task is sleeping on tasklist_lock */ + raw_spin_lock_irq(&task->pi_lock); + if (READ_ONCE(task->__state) & __TASK_TRACED) + traced = true; + else if (task->saved_state & __TASK_TRACED) + traced = true; + raw_spin_unlock_irq(&task->pi_lock); + return traced; +} + +static inline bool task_is_stopped_or_traced(struct task_struct *task) +{ + bool traced_stopped = false; + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + if (READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) + traced_stopped = true; + else if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED)) + traced_stopped = true; + + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + return traced_stopped; +} + +#else + +static inline bool task_match_saved_state(struct task_struct *p, long match_state) +{ + return false; +} + +static inline bool task_is_traced(struct task_struct *task) +{ + return READ_ONCE(task->__state) & __TASK_TRACED; +} + +static inline bool task_is_stopped_or_traced(struct task_struct *task) +{ + return READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED); +} +#endif + +static inline bool task_match_state_or_saved(struct task_struct *p, + long match_state) +{ + if (READ_ONCE(p->__state) == match_state) + return true; + + return task_match_saved_state(p, match_state); +} + +static inline bool task_match_state_lock(struct task_struct *p, + long match_state) +{ + bool match; + + raw_spin_lock_irq(&p->pi_lock); + match = task_match_state_or_saved(p, match_state); + raw_spin_unlock_irq(&p->pi_lock); + + return match; +} + /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. 
The return @@ -2047,7 +2169,7 @@ static inline int _cond_resched(void) { return 0; } #endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */ #define cond_resched() ({ \ - ___might_sleep(__FILE__, __LINE__, 0); \ + __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) @@ -2055,19 +2177,38 @@ extern int __cond_resched_lock(spinlock_t *lock); extern int __cond_resched_rwlock_read(rwlock_t *lock); extern int __cond_resched_rwlock_write(rwlock_t *lock); -#define cond_resched_lock(lock) ({ \ - ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ - __cond_resched_lock(lock); \ +#define MIGHT_RESCHED_RCU_SHIFT 8 +#define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) + +#ifndef CONFIG_PREEMPT_RT +/* + * Non RT kernels have an elevated preempt count due to the held lock, + * but are not allowed to be inside a RCU read side critical section + */ +# define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET +#else +/* + * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in + * cond_resched*lock() has to take that into account because it checks for + * preempt_count() and rcu_preempt_depth(). + */ +# define PREEMPT_LOCK_RESCHED_OFFSETS \ + (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) +#endif + +#define cond_resched_lock(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_lock(lock); \ }) -#define cond_resched_rwlock_read(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ - __cond_resched_rwlock_read(lock); \ +#define cond_resched_rwlock_read(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_rwlock_read(lock); \ }) -#define cond_resched_rwlock_write(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ - __cond_resched_rwlock_write(lock); \ +#define cond_resched_rwlock_write(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_rwlock_write(lock); \ }) static inline void cond_resched_rcu(void) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 95fb7aaaec8d..28e9cc60f47e 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,6 +49,26 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +#ifdef CONFIG_PREEMPT_RT +extern void __mmdrop_delayed(struct rcu_head *rhp); + +/* + * Invoked from finish_task_switch(). Delegates the heavy lifting on RT + * kernels via RCU. + */ +static inline void mmdrop_sched(struct mm_struct *mm) +{ + /* Provides a full memory barrier. See mmdrop() */ + if (atomic_dec_and_test(&mm->mm_count)) + call_rcu(&mm->delayed_drop, __mmdrop_delayed); +} +#else +static inline void mmdrop_sched(struct mm_struct *mm) +{ + mmdrop(mm); +} +#endif + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. 
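The mmdrop_sched() hunk above is easiest to read as a refcounting pattern: on PREEMPT_RT the task that drops the last reference must not run the expensive __mmdrop() inline from the scheduler path, so it hands the mm to an RCU callback instead. Below is a minimal standalone C model of that control flow, not kernel code; struct mm_model, defer_free() and mm_teardown() are invented for illustration, and defer_free() stands in for call_rcu() but runs its callback immediately rather than after a grace period.

/*
 * Standalone model of the mmdrop_sched() idea: the last dropper on
 * PREEMPT_RT delegates the expensive teardown instead of doing it
 * inline. All names here are made up for illustration.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct mm_model {
        atomic_int mm_count;
        /* stand-in for struct rcu_head delayed_drop */
        void (*deferred_cb)(struct mm_model *);
};

static void mm_teardown(struct mm_model *mm)
{
        printf("expensive teardown\n");
        free(mm);
}

/* stand-in for call_rcu(); a real RCU callback would run later */
static void defer_free(struct mm_model *mm, void (*cb)(struct mm_model *))
{
        mm->deferred_cb = cb;
        mm->deferred_cb(mm);
}

static void mmdrop_sched_model(struct mm_model *mm, bool preempt_rt)
{
        /* atomic_fetch_sub() returns the previous value */
        if (atomic_fetch_sub(&mm->mm_count, 1) != 1)
                return;                         /* not the last reference */

        if (preempt_rt)
                defer_free(mm, mm_teardown);    /* delegate, don't block */
        else
                mm_teardown(mm);                /* cheap enough inline */
}

int main(void)
{
        struct mm_model *mm = calloc(1, sizeof(*mm));

        atomic_store(&mm->mm_count, 2);
        mmdrop_sched_model(mm, true);   /* drops to 1: nothing happens */
        mmdrop_sched_model(mm, true);   /* last drop: deferred path */
        return 0;
}

Built with any C11 compiler, only the second drop reaches the teardown, via the deferred path, which is exactly the property the RT hunk is after.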
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 5db211f43b29..aa011f668705 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -7,6 +7,7 @@ #ifndef _LINUX_SERIAL_8250_H #define _LINUX_SERIAL_8250_H +#include #include #include #include @@ -125,6 +126,8 @@ struct uart_8250_port { #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA unsigned char msr_saved_flags; + atomic_t console_printing; + struct uart_8250_dma *dma; const struct uart_8250_ops *ops; @@ -180,6 +183,8 @@ void serial8250_init_port(struct uart_8250_port *up); void serial8250_set_defaults(struct uart_8250_port *up); void serial8250_console_write(struct uart_8250_port *up, const char *s, unsigned int count); +void serial8250_console_write_atomic(struct uart_8250_port *up, const char *s, + unsigned int count); int serial8250_console_setup(struct uart_port *port, char *options, bool probe); int serial8250_console_exit(struct uart_port *port); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e213acaa91ec..d8bc89ee46e3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -300,6 +300,7 @@ struct sk_buff_head { __u32 qlen; spinlock_t lock; + raw_spinlock_t raw_lock; }; struct sk_buff; @@ -1945,6 +1946,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) __skb_queue_head_init(list); } +static inline void skb_queue_head_init_raw(struct sk_buff_head *list) +{ + raw_spin_lock_init(&list->raw_lock); + __skb_queue_head_init(list); +} + static inline void skb_queue_head_init_class(struct sk_buff_head *list, struct lock_class_key *class) { diff --git a/include/linux/smp.h b/include/linux/smp.h index 510519e8a1eb..7ac9fdb5ad09 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -268,6 +268,9 @@ static inline int get_boot_cpu_id(void) #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) #define put_cpu() preempt_enable() +#define get_cpu_light() ({ migrate_disable(); __smp_processor_id(); }) +#define put_cpu_light() migrate_enable() + /* * Callback to arch code if there's nosmp or maxcpus=0 on the * boot command line: diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h index c09b6407ae1b..7f86a2016ac5 100644 --- a/include/linux/spinlock_types_up.h +++ b/include/linux/spinlock_types_up.h @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_TYPES_UP_H #define __LINUX_SPINLOCK_TYPES_UP_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 4bcd65679cee..4cd3bc5d3891 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -541,23 +541,17 @@ static inline void unlock_system_sleep(void) {} #ifdef CONFIG_PM_SLEEP_DEBUG extern bool pm_print_times_enabled; extern bool pm_debug_messages_on; -extern __printf(2, 3) void __pm_pr_dbg(bool defer, const char *fmt, ...); +extern __printf(1, 2) void pm_pr_dbg(const char *fmt, ...); #else #define pm_print_times_enabled (false) #define pm_debug_messages_on (false) #include -#define __pm_pr_dbg(defer, fmt, ...) \ +#define pm_pr_dbg(fmt, ...) \ no_printk(KERN_DEBUG fmt, ##__VA_ARGS__) #endif -#define pm_pr_dbg(fmt, ...) \ - __pm_pr_dbg(false, fmt, ##__VA_ARGS__) - -#define pm_deferred_pr_dbg(fmt, ...) 
\ - __pm_pr_dbg(true, fmt, ##__VA_ARGS__) - #ifdef CONFIG_PM_AUTOSLEEP /* kernel/power/autosleep.c */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 0999f6317978..7af834b7c114 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -163,7 +163,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) clear_ti_thread_flag(task_thread_info(t), TIF_##fl) #endif /* !CONFIG_GENERIC_ENTRY */ -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) +#ifdef CONFIG_PREEMPT_LAZY +#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \ + test_thread_flag(TIF_NEED_RESCHED_LAZY)) +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED)) +#define tif_need_resched_lazy() test_thread_flag(TIF_NEED_RESCHED_LAZY) + +#else +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED) +#define tif_need_resched_lazy() 0 +#endif #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 57113190448c..827725f41149 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -69,6 +69,7 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; + unsigned char preempt_lazy_count; }; #define TRACE_EVENT_TYPE_MAX \ @@ -157,9 +158,10 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry, unsigned int trace_ctx) { entry->preempt_count = trace_ctx & 0xff; + entry->preempt_lazy_count = (trace_ctx >> 16) & 0xff; entry->pid = current->pid; entry->type = type; - entry->flags = trace_ctx >> 16; + entry->flags = trace_ctx >> 24; } unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); @@ -172,6 +174,7 @@ enum trace_flag_type { TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, TRACE_FLAG_NMI = 0x40, + TRACE_FLAG_NEED_RESCHED_LAZY = 0x80, }; #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index e81856c0ba13..81dc1f5e181a 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -66,7 +66,7 @@ #include struct u64_stats_sync { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG==32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) seqcount_t seq; #endif }; @@ -83,6 +83,11 @@ static inline u64 u64_stats_read(const u64_stats_t *p) return local64_read(&p->v); } +static inline void u64_stats_set(u64_stats_t *p, u64 val) +{ + local64_set(&p->v, val); +} + static inline void u64_stats_add(u64_stats_t *p, unsigned long val) { local64_add(val, &p->v); @@ -104,6 +109,11 @@ static inline u64 u64_stats_read(const u64_stats_t *p) return p->v; } +static inline void u64_stats_set(u64_stats_t *p, u64 val) +{ + p->v = val; +} + static inline void u64_stats_add(u64_stats_t *p, unsigned long val) { p->v += val; @@ -115,7 +125,7 @@ static inline void u64_stats_inc(u64_stats_t *p) } #endif -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) #define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) #else static inline void u64_stats_init(struct u64_stats_sync *syncp) @@ -125,15 +135,19 @@ static inline void u64_stats_init(struct u64_stats_sync *syncp) static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if 
BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); write_seqcount_begin(&syncp->seq); #endif } static inline void u64_stats_update_end(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); #endif } @@ -142,8 +156,11 @@ u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) { unsigned long flags = 0; -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - local_irq_save(flags); +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + else + local_irq_save(flags); write_seqcount_begin(&syncp->seq); #endif return flags; @@ -153,15 +170,18 @@ static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, unsigned long flags) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); - local_irq_restore(flags); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + else + local_irq_restore(flags); #endif } static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_begin(&syncp->seq); #else return 0; @@ -170,7 +190,7 @@ static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync * static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) preempt_disable(); #endif return __u64_stats_fetch_begin(syncp); @@ -179,7 +199,7 @@ static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *sy static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_retry(&syncp->seq, start); #else return false; @@ -189,7 +209,7 @@ static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) preempt_enable(); #endif return __u64_stats_fetch_retry(syncp, start); @@ -203,7 +223,9 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, */ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) + preempt_disable(); +#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) local_irq_disable(); #endif return __u64_stats_fetch_begin(syncp); @@ -212,7 +234,9 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) + preempt_enable(); +#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) 
local_irq_enable(); #endif return __u64_stats_fetch_retry(syncp, start); diff --git a/include/net/act_api.h b/include/net/act_api.h index f19f7f4a463c..b5b624c7e488 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -30,13 +30,13 @@ struct tc_action { atomic_t tcfa_bindcnt; int tcfa_action; struct tcf_t tcfa_tm; - struct gnet_stats_basic_packed tcfa_bstats; - struct gnet_stats_basic_packed tcfa_bstats_hw; + struct gnet_stats_basic_sync tcfa_bstats; + struct gnet_stats_basic_sync tcfa_bstats_hw; struct gnet_stats_queue tcfa_qstats; struct net_rate_estimator __rcu *tcfa_rate_est; spinlock_t tcfa_lock; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; - struct gnet_stats_basic_cpu __percpu *cpu_bstats_hw; + struct gnet_stats_basic_sync __percpu *cpu_bstats; + struct gnet_stats_basic_sync __percpu *cpu_bstats_hw; struct gnet_stats_queue __percpu *cpu_qstats; struct tc_cookie __rcu *act_cookie; struct tcf_chain __rcu *goto_chain; @@ -206,7 +206,7 @@ static inline void tcf_action_update_bstats(struct tc_action *a, struct sk_buff *skb) { if (likely(a->cpu_bstats)) { - bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); + bstats_update(this_cpu_ptr(a->cpu_bstats), skb); return; } spin_lock(&a->tcfa_lock); diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 1424e02cef90..7aa2b8e1fb29 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -7,14 +7,17 @@ #include #include -/* Note: this used to be in include/uapi/linux/gen_stats.h */ -struct gnet_stats_basic_packed { - __u64 bytes; - __u64 packets; -}; - -struct gnet_stats_basic_cpu { - struct gnet_stats_basic_packed bstats; +/* Throughput stats. + * Must be initialized beforehand with gnet_stats_basic_sync_init(). + * + * If no reads can ever occur parallel to writes (e.g. stack-allocated + * bstats), then the internal stat values can be written to and read + * from directly. Otherwise, use _bstats_set/update() for writes and + * gnet_stats_add_basic() for reads. 
+ */ +struct gnet_stats_basic_sync { + u64_stats_t bytes; + u64_stats_t packets; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); @@ -34,6 +37,7 @@ struct gnet_dump { struct tc_stats tc_stats; }; +void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b); int gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, struct gnet_dump *d, int padattr); @@ -42,41 +46,38 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, spinlock_t *lock, struct gnet_dump *d, int padattr); -int gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b); -void __gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b); -int gnet_stats_copy_basic_hw(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b); +int gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running); +void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running); +int gnet_stats_copy_basic_hw(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running); int gnet_stats_copy_rate_est(struct gnet_dump *d, struct net_rate_estimator __rcu **ptr); int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue __percpu *cpu_q, struct gnet_stats_queue *q, __u32 qlen); -void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *cpu_q, - const struct gnet_stats_queue *q, __u32 qlen); +void gnet_stats_add_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu_q, + const struct gnet_stats_queue *q); int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); int gnet_stats_finish_copy(struct gnet_dump *d); -int gen_new_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu_bstats, +int gen_new_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, - seqcount_t *running, struct nlattr *opt); + bool running, struct nlattr *opt); void gen_kill_estimator(struct net_rate_estimator __rcu **ptr); -int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu_bstats, +int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **ptr, spinlock_t *lock, - seqcount_t *running, struct nlattr *opt); + bool running, struct nlattr *opt); bool gen_estimator_active(struct net_rate_estimator __rcu **ptr); bool gen_estimator_read(struct net_rate_estimator __rcu **ptr, struct gnet_stats_rate_est64 *sample); diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h index 832ab69efda5..4c3809e141f4 100644 --- a/include/net/netfilter/xt_rateest.h +++ b/include/net/netfilter/xt_rateest.h @@ -6,7 +6,7 @@ struct xt_rateest { /* keep lock and bstats on same cache line to speedup xt_rateest_tg() */ - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; spinlock_t lock; diff --git a/include/net/pkt_cls.h 
b/include/net/pkt_cls.h index 83a6d0792180..4a5833108083 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -765,7 +765,7 @@ struct tc_cookie { }; struct tc_qopt_offload_stats { - struct gnet_stats_basic_packed *bstats; + struct gnet_stats_basic_sync *bstats; struct gnet_stats_queue *qstats; }; @@ -885,7 +885,7 @@ struct tc_gred_qopt_offload_params { }; struct tc_gred_qopt_offload_stats { - struct gnet_stats_basic_packed bstats[MAX_DPs]; + struct gnet_stats_basic_sync bstats[MAX_DPs]; struct gnet_stats_queue qstats[MAX_DPs]; struct red_stats *xstats[MAX_DPs]; }; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 1958d1260fe9..e43f3922faa7 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -40,6 +40,13 @@ enum qdisc_state_t { __QDISC_STATE_DRAINING, }; +enum qdisc_state2_t { + /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly. + * Use qdisc_run_begin/end() or qdisc_is_running() instead. + */ + __QDISC_STATE2_RUNNING, +}; + #define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) #define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) @@ -97,7 +104,7 @@ struct Qdisc { struct netdev_queue *dev_queue; struct net_rate_estimator __rcu *rate_est; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; int pad; refcount_t refcnt; @@ -107,10 +114,10 @@ struct Qdisc { */ struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; struct qdisc_skb_head q; - struct gnet_stats_basic_packed bstats; - seqcount_t running; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; unsigned long state; + unsigned long state2; /* must be written under qdisc spinlock */ struct Qdisc *next_sched; struct sk_buff_head skb_bad_txq; @@ -143,11 +150,15 @@ static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc) return NULL; } +/* For !TCQ_F_NOLOCK qdisc: callers must either call this within a qdisc + * root_lock section, or provide their own memory barriers -- ordering + * against qdisc_run_begin/end() atomic bit operations. + */ static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) return spin_is_locked(&qdisc->seqlock); - return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; + return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); } static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) @@ -167,6 +178,9 @@ static inline bool qdisc_is_empty(const struct Qdisc *qdisc) return !READ_ONCE(qdisc->q.qlen); } +/* For !TCQ_F_NOLOCK qdisc, qdisc_run_begin/end() must be invoked with + * the qdisc root lock acquired. + */ static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) { @@ -186,15 +200,8 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) * when testing it in qdisc_run_end() */ return spin_trylock(&qdisc->seqlock); - } else if (qdisc_is_running(qdisc)) { - return false; } - /* Variant of write_seqcount_begin() telling lockdep a trylock - * was attempted. 
- */ - raw_write_seqcount_begin(&qdisc->running); - seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_); - return true; + return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); } static inline void qdisc_run_end(struct Qdisc *qdisc) @@ -212,7 +219,7 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) &qdisc->state))) __netif_schedule(qdisc); } else { - write_seqcount_end(&qdisc->running); + __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); } } @@ -576,14 +583,6 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) return qdisc_lock(root); } -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc) -{ - struct Qdisc *root = qdisc_root_sleeping(qdisc); - - ASSERT_RTNL(); - return &root->running; -} - static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc) { return qdisc->dev_queue->dev; @@ -833,14 +832,16 @@ static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, return sch->enqueue(skb, sch, to_free); } -static inline void _bstats_update(struct gnet_stats_basic_packed *bstats, +static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, __u64 bytes, __u32 packets) { - bstats->bytes += bytes; - bstats->packets += packets; + u64_stats_update_begin(&bstats->syncp); + u64_stats_add(&bstats->bytes, bytes); + u64_stats_add(&bstats->packets, packets); + u64_stats_update_end(&bstats->syncp); } -static inline void bstats_update(struct gnet_stats_basic_packed *bstats, +static inline void bstats_update(struct gnet_stats_basic_sync *bstats, const struct sk_buff *skb) { _bstats_update(bstats, @@ -848,26 +849,10 @@ static inline void bstats_update(struct gnet_stats_basic_packed *bstats, skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1); } -static inline void _bstats_cpu_update(struct gnet_stats_basic_cpu *bstats, - __u64 bytes, __u32 packets) -{ - u64_stats_update_begin(&bstats->syncp); - _bstats_update(&bstats->bstats, bytes, packets); - u64_stats_update_end(&bstats->syncp); -} - -static inline void bstats_cpu_update(struct gnet_stats_basic_cpu *bstats, - const struct sk_buff *skb) -{ - u64_stats_update_begin(&bstats->syncp); - bstats_update(&bstats->bstats, skb); - u64_stats_update_end(&bstats->syncp); -} - static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, const struct sk_buff *skb) { - bstats_cpu_update(this_cpu_ptr(sch->cpu_bstats), skb); + bstats_update(this_cpu_ptr(sch->cpu_bstats), skb); } static inline void qdisc_bstats_update(struct Qdisc *sch, @@ -956,10 +941,9 @@ static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch, __u32 *qlen, __u32 *backlog) { struct gnet_stats_queue qstats = { 0 }; - __u32 len = qdisc_qlen_sum(sch); - __gnet_stats_copy_queue(&qstats, sch->cpu_qstats, &sch->qstats, len); - *qlen = qstats.qlen; + gnet_stats_add_queue(&qstats, sch->cpu_qstats, &sch->qstats); + *qlen = qstats.qlen + qdisc_qlen(sch); *backlog = qstats.backlog; } @@ -1305,7 +1289,7 @@ void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64); struct mini_Qdisc { struct tcf_proto *filter_list; struct tcf_block *block; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; struct rcu_head rcu; }; @@ -1313,7 +1297,7 @@ struct mini_Qdisc { static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq, const struct sk_buff *skb) { - bstats_cpu_update(this_cpu_ptr(miniq->cpu_bstats), skb); + bstats_update(this_cpu_ptr(miniq->cpu_bstats), skb); } static inline 
void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq) diff --git a/init/Kconfig b/init/Kconfig index d19ed66aba3b..160f836f81c7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -906,7 +906,7 @@ config NUMA_BALANCING bool "Memory placement aware NUMA scheduler" depends on ARCH_SUPPORTS_NUMA_BALANCING depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION + depends on SMP && NUMA && MIGRATION && !PREEMPT_RT help This option adds support for automatic NUMA aware memory/task placement. The mechanism is quite primitive and is based on migrating memory when @@ -943,6 +943,7 @@ config PAGE_COUNTER config MEMCG bool "Memory controller" + depends on !PREEMPT_RT select PAGE_COUNTER select EVENTFD help @@ -1901,6 +1902,7 @@ choice config SLAB bool "SLAB" + depends on !PREEMPT_RT select HAVE_HARDENED_USERCOPY_ALLOCATOR help The regular slab allocator that is established and known to work @@ -1921,6 +1923,7 @@ config SLUB config SLOB depends on EXPERT bool "SLOB (Simple Allocator)" + depends on !PREEMPT_RT help SLOB replaces the stock allocator with a drastically simpler allocator. SLOB is generally more space efficient but diff --git a/init/main.c b/init/main.c index cf79b5a766cb..500a40b705e9 100644 --- a/init/main.c +++ b/init/main.c @@ -1605,6 +1605,7 @@ static noinline void __init kernel_init_freeable(void) rcu_init_tasks_generic(); do_pre_smp_initcalls(); + rcu_tasks_initiate_self_tests(); lockup_detector_init(); smp_init(); diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 5876e30c5740..5df0776264c2 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,5 +1,11 @@ # SPDX-License-Identifier: GPL-2.0-only +config HAVE_PREEMPT_LAZY + bool + +config PREEMPT_LAZY + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT + choice prompt "Preemption Model" default PREEMPT_NONE diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 1486768f2318..bb3b805436c4 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -156,8 +156,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); struct cgroup *pos = NULL; + unsigned long flags; - raw_spin_lock(cpu_lock); + raw_spin_lock_irqsave(cpu_lock, flags); while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { struct cgroup_subsys_state *css; @@ -169,7 +170,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) css->ss->css_rstat_flush(css, cpu); rcu_read_unlock(); } - raw_spin_unlock(cpu_lock); + raw_spin_unlock_irqrestore(cpu_lock, flags); /* if @may_sleep, play nice and yield if necessary */ if (may_sleep && (need_resched() || diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 7beceb447211..28497c00e63b 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -239,35 +239,42 @@ NOKPROBE_SYMBOL(kgdb_call_nmi_hook); static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) = CSD_INIT(kgdb_call_nmi_hook, NULL); -void __weak kgdb_roundup_cpus(void) +void __weak kgdb_roundup_cpu(unsigned int cpu) { call_single_data_t *csd; + int ret; + + csd = &per_cpu(kgdb_roundup_csd, cpu); + + /* + * If it didn't round up last time, don't try again + * since smp_call_function_single_async() will block. + * + * If rounding_up is false then we know that the + * previous call must have at least started and that + * means smp_call_function_single_async() won't block. 
+ */ + if (kgdb_info[cpu].rounding_up) + return; + kgdb_info[cpu].rounding_up = true; + + ret = smp_call_function_single_async(cpu, csd); + if (ret) + kgdb_info[cpu].rounding_up = false; +} +NOKPROBE_SYMBOL(kgdb_roundup_cpu); + +void __weak kgdb_roundup_cpus(void) +{ int this_cpu = raw_smp_processor_id(); int cpu; - int ret; for_each_online_cpu(cpu) { /* No need to roundup ourselves */ if (cpu == this_cpu) continue; - csd = &per_cpu(kgdb_roundup_csd, cpu); - - /* - * If it didn't round up last time, don't try again - * since smp_call_function_single_async() will block. - * - * If rounding_up is false then we know that the - * previous call must have at least started and that - * means smp_call_function_single_async() won't block. - */ - if (kgdb_info[cpu].rounding_up) - continue; - kgdb_info[cpu].rounding_up = true; - - ret = smp_call_function_single_async(cpu, csd); - if (ret) - kgdb_info[cpu].rounding_up = false; + kgdb_roundup_cpu(cpu); } } NOKPROBE_SYMBOL(kgdb_roundup_cpus); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 6735ac36b718..539a2f0dc89d 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -559,23 +559,17 @@ static void kdb_msg_write(const char *msg, int msg_len) cp++; } + /* mirror output on atomic consoles */ for_each_console(c) { if (!(c->flags & CON_ENABLED)) continue; if (c == dbg_io_ops->cons) continue; - /* - * Set oops_in_progress to encourage the console drivers to - * disregard their internal spin locks: in the current calling - * context the risk of deadlock is a bigger problem than risks - * due to re-entering the console driver. We operate directly on - * oops_in_progress rather than using bust_spinlocks() because - * the calls bust_spinlocks() makes on exit are not appropriate - * for this calling context. 
- */ - ++oops_in_progress; - c->write(c, msg, msg_len); - --oops_in_progress; + + if (!c->write_atomic) + continue; + c->write_atomic(c, msg, msg_len); + touch_nmi_watchdog(); } } diff --git a/kernel/entry/common.c b/kernel/entry/common.c index d5a61d565ad5..a9579f8bf4f0 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -159,9 +159,17 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, local_irq_enable_exit_to_user(ti_work); - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & _TIF_NEED_RESCHED_MASK) schedule(); +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (unlikely(current->forced_info.si_signo)) { + struct task_struct *t = current; + force_sig_info(&t->forced_info); + t->forced_info.si_signo = 0; + } +#endif + if (ti_work & _TIF_UPROBE) uprobe_notify_resume(regs); @@ -387,7 +395,7 @@ void irqentry_exit_cond_resched(void) rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) + if (should_resched(0)) preempt_schedule_irq(); } } diff --git a/kernel/exit.c b/kernel/exit.c index 91a43e57a32e..1d099609568d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -168,8 +169,14 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + kprobe_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); + + /* RT enabled kernels delay freeing the VMAP'ed task stack */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + put_task_stack(tsk); + put_task_struct(tsk); } diff --git a/kernel/fork.c b/kernel/fork.c index 89475c994ca9..dc1aa0f71089 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -289,7 +289,10 @@ static inline void free_thread_stack(struct task_struct *tsk) return; } - vfree_atomic(tsk->stack); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + vfree_atomic(tsk->stack); + else + vfree(tsk->stack); return; } #endif @@ -705,6 +708,19 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +#ifdef CONFIG_PREEMPT_RT +/* + * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is + * by far the least expensive way to do that. + */ +void __mmdrop_delayed(struct rcu_head *rhp) +{ + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); + + __mmdrop(mm); +} +#endif + static void mmdrop_async_fn(struct work_struct *work) { struct mm_struct *mm; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 21b3ac2a29d2..26f343228230 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -661,6 +661,29 @@ int generic_handle_irq(unsigned int irq) } EXPORT_SYMBOL_GPL(generic_handle_irq); +/** + * generic_handle_irq_safe - Invoke the handler for a particular irq from any + * context. + * @irq: The irq number to handle + * + * Returns: 0 on success, a negative value on error. + * + * This function can be called from any context (IRQ or process context). It + * will report an error if not invoked from IRQ context and the irq has been + * marked to enforce IRQ-context only. 
+ */ +int generic_handle_irq_safe(unsigned int irq) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = handle_irq_desc(irq_to_desc(irq)); + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(generic_handle_irq_safe); + #ifdef CONFIG_IRQ_DOMAIN /** * generic_handle_domain_irq - Invoke the handler for a HW irq belonging diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0c3c26fb054f..16372116f393 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1286,6 +1286,8 @@ static int irq_thread(void *data) irq_thread_set_ready(desc, action); + sched_set_fifo(current); + if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) handler_fn = irq_forced_thread_fn; @@ -1451,8 +1453,6 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) if (IS_ERR(t)) return PTR_ERR(t); - sched_set_fifo(t); - /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code @@ -2846,7 +2846,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state); * This call sets the internal irqchip state of an interrupt, * depending on the value of @which. * - * This function should be called with preemption disabled if the + * This function should be called with migration disabled if the * interrupt controller has per-cpu registers. */ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index c481d8458325..02b2daf07441 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -447,6 +447,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); static int __init irqfixup_setup(char *str) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + pr_warn("irqfixup boot option not supported with PREEMPT_RT\n"); + return 1; + } irqfixup = 1; printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); printk(KERN_WARNING "This may impact system performance.\n"); @@ -459,6 +463,10 @@ module_param(irqfixup, int, 0644); static int __init irqpoll_setup(char *str) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + pr_warn("irqpoll boot option not supported with PREEMPT_RT\n"); + return 1; + } irqfixup = 2; printk(KERN_WARNING "Misrouted IRQ fixup and polling support " "enabled\n"); diff --git a/kernel/irq_work.c b/kernel/irq_work.c index db8c248ebc8c..f7df715ec28e 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -18,11 +18,36 @@ #include #include #include +#include #include #include static DEFINE_PER_CPU(struct llist_head, raised_list); static DEFINE_PER_CPU(struct llist_head, lazy_list); +static DEFINE_PER_CPU(struct task_struct *, irq_workd); + +static void wake_irq_workd(void) +{ + struct task_struct *tsk = __this_cpu_read(irq_workd); + + if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk) + wake_up_process(tsk); +} + +#ifdef CONFIG_SMP +static void irq_work_wake(struct irq_work *entry) +{ + wake_irq_workd(); +} + +static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) = + IRQ_WORK_INIT_HARD(irq_work_wake); +#endif + +static int irq_workd_should_run(unsigned int cpu) +{ + return !llist_empty(this_cpu_ptr(&lazy_list)); +} /* * Claim the entry so that no one else will poke at it. 
@@ -52,15 +77,29 @@ void __weak arch_irq_work_raise(void) /* Enqueue on current CPU, work must already be claimed and preempt disabled */ static void __irq_work_queue_local(struct irq_work *work) { + struct llist_head *list; + bool rt_lazy_work = false; + bool lazy_work = false; + int work_flags; + + work_flags = atomic_read(&work->node.a_flags); + if (work_flags & IRQ_WORK_LAZY) + lazy_work = true; + else if (IS_ENABLED(CONFIG_PREEMPT_RT) && + !(work_flags & IRQ_WORK_HARD_IRQ)) + rt_lazy_work = true; + + if (lazy_work || rt_lazy_work) + list = this_cpu_ptr(&lazy_list); + else + list = this_cpu_ptr(&raised_list); + + if (!llist_add(&work->node.llist, list)) + return; + /* If the work is "lazy", handle it from next tick if any */ - if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { - if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && - tick_nohz_tick_stopped()) - arch_irq_work_raise(); - } else { - if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list))) - arch_irq_work_raise(); - } + if (!lazy_work || tick_nohz_tick_stopped()) + arch_irq_work_raise(); } /* Enqueue the irq work @work on the current CPU */ @@ -104,17 +143,34 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); + + /* + * On PREEMPT_RT the items which are not marked as + * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work + * item is used on the remote CPU to wake the thread. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && + !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) { + + if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu))) + goto out; + + work = &per_cpu(irq_work_wakeup, cpu); + if (!irq_work_claim(work)) + goto out; + } + __smp_call_single_queue(cpu, &work->node.llist); } else { __irq_work_queue_local(work); } +out: preempt_enable(); return true; #endif /* CONFIG_SMP */ } - bool irq_work_needs_cpu(void) { struct llist_head *raised, *lazy; @@ -160,6 +216,10 @@ void irq_work_single(void *arg) * else claimed it meanwhile. */ (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); + + if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || + !arch_irq_work_has_interrupt()) + rcuwait_wake_up(&work->irqwait); } static void irq_work_run_list(struct llist_head *list) @@ -167,7 +227,12 @@ static void irq_work_run_list(struct llist_head *list) struct irq_work *work, *tmp; struct llist_node *llnode; - BUG_ON(!irqs_disabled()); + /* + * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed + * in a per-CPU thread in preemptible context. Only the items which are + * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context. 
+ */ + BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT)); if (llist_empty(list)) return; @@ -184,7 +249,10 @@ static void irq_work_run_list(struct llist_head *list) void irq_work_run(void) { irq_work_run_list(this_cpu_ptr(&raised_list)); - irq_work_run_list(this_cpu_ptr(&lazy_list)); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + irq_work_run_list(this_cpu_ptr(&lazy_list)); + else + wake_irq_workd(); } EXPORT_SYMBOL_GPL(irq_work_run); @@ -194,7 +262,11 @@ void irq_work_tick(void) if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) irq_work_run_list(raised); - irq_work_run_list(this_cpu_ptr(&lazy_list)); + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + irq_work_run_list(this_cpu_ptr(&lazy_list)); + else + wake_irq_workd(); } /* @@ -204,8 +276,42 @@ void irq_work_tick(void) void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); + might_sleep(); + + if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || + !arch_irq_work_has_interrupt()) { + rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), + TASK_UNINTERRUPTIBLE); + return; + } while (irq_work_is_busy(work)) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); + +static void run_irq_workd(unsigned int cpu) +{ + irq_work_run_list(this_cpu_ptr(&lazy_list)); +} + +static void irq_workd_setup(unsigned int cpu) +{ + sched_set_fifo_low(current); +} + +static struct smp_hotplug_thread irqwork_threads = { + .store = &irq_workd, + .setup = irq_workd_setup, + .thread_should_run = irq_workd_should_run, + .thread_fn = run_irq_workd, + .thread_comm = "irq_work/%u", +}; + +static __init int irq_work_init_threads(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + BUG_ON(smpboot_register_percpu_thread(&irqwork_threads)); + return 0; +} +early_initcall(irq_work_init_threads); diff --git a/kernel/kcov.c b/kernel/kcov.c index 80bfe71bbe13..36ca640c4f8e 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -88,6 +88,7 @@ static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas); struct kcov_percpu_data { void *irq_area; + local_lock_t lock; unsigned int saved_mode; unsigned int saved_size; @@ -96,7 +97,9 @@ struct kcov_percpu_data { int saved_sequence; }; -static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); +static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data) = { + .lock = INIT_LOCAL_LOCK(lock), +}; /* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_find(u64 handle) @@ -824,7 +827,7 @@ void kcov_remote_start(u64 handle) if (!in_task() && !in_serving_softirq()) return; - local_irq_save(flags); + local_lock_irqsave(&kcov_percpu_data.lock, flags); /* * Check that kcov_remote_start() is not called twice in background @@ -832,7 +835,7 @@ void kcov_remote_start(u64 handle) */ mode = READ_ONCE(t->kcov_mode); if (WARN_ON(in_task() && kcov_mode_enabled(mode))) { - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } /* @@ -841,14 +844,15 @@ void kcov_remote_start(u64 handle) * happened while collecting coverage from a background thread. 
*/ if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) { - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } spin_lock(&kcov_remote_lock); remote = kcov_remote_find(handle); if (!remote) { - spin_unlock_irqrestore(&kcov_remote_lock, flags); + spin_unlock(&kcov_remote_lock); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } kcov_debug("handle = %llx, context: %s\n", handle, @@ -869,19 +873,19 @@ void kcov_remote_start(u64 handle) size = CONFIG_KCOV_IRQ_AREA_SIZE; area = this_cpu_ptr(&kcov_percpu_data)->irq_area; } - spin_unlock_irqrestore(&kcov_remote_lock, flags); + spin_unlock(&kcov_remote_lock); /* Can only happen when in_task(). */ if (!area) { + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); area = vmalloc(size * sizeof(unsigned long)); if (!area) { kcov_put(kcov); return; } + local_lock_irqsave(&kcov_percpu_data.lock, flags); } - local_irq_save(flags); - /* Reset coverage size. */ *(u64 *)area = 0; @@ -891,7 +895,7 @@ void kcov_remote_start(u64 handle) } kcov_start(t, kcov, size, area, mode, sequence); - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); } EXPORT_SYMBOL(kcov_remote_start); @@ -965,12 +969,12 @@ void kcov_remote_stop(void) if (!in_task() && !in_serving_softirq()) return; - local_irq_save(flags); + local_lock_irqsave(&kcov_percpu_data.lock, flags); mode = READ_ONCE(t->kcov_mode); barrier(); if (!kcov_mode_enabled(mode)) { - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } /* @@ -978,12 +982,12 @@ void kcov_remote_stop(void) * actually found the remote handle and started collecting coverage. */ if (in_serving_softirq() && !t->kcov_softirq) { - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } /* Make sure that kcov_softirq is only set when in softirq. */ if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } @@ -1013,7 +1017,7 @@ void kcov_remote_stop(void) spin_unlock(&kcov_remote_lock); } - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); /* Get in kcov_remote_start(). */ kcov_put(kcov); @@ -1034,8 +1038,8 @@ static int __init kcov_init(void) int cpu; for_each_possible_cpu(cpu) { - void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE * - sizeof(unsigned long)); + void *area = vmalloc_node(CONFIG_KCOV_IRQ_AREA_SIZE * + sizeof(unsigned long), cpu_to_node(cpu)); if (!area) return -ENOMEM; per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2ef90d15699f..2ab883d856b5 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1250,10 +1250,10 @@ void kprobe_busy_end(void) } /* - * This function is called from finish_task_switch when task tk becomes dead, - * so that we can recycle any function-return probe instances associated - * with this task. These left over instances represent probed functions - * that have been called but will never return. + * This function is called from delayed_put_task_struct() when a task is + * dead and cleaned up to recycle any function-return probe instances + * associated with this task. These left over instances represent probed + * functions that have been called but will never return. 
*/ void kprobe_flush_task(struct task_struct *tk) { diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 35859da8bd4f..dfff31ed644a 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -138,6 +138,15 @@ KERNEL_ATTR_RO(vmcoreinfo); #endif /* CONFIG_CRASH_CORE */ +#if defined(CONFIG_PREEMPT_RT) +static ssize_t realtime_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", 1); +} +KERNEL_ATTR_RO(realtime); +#endif + /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -228,6 +237,9 @@ static struct attribute * kernel_attrs[] = { #ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, &rcu_normal_attr.attr, +#endif +#ifdef CONFIG_PREEMPT_RT + &realtime_attr.attr, #endif NULL }; diff --git a/kernel/kthread.c b/kernel/kthread.c index 5b37a8567168..4a4d7092a2d8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -270,6 +270,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme); static int kthread(void *_create) { + static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; @@ -300,6 +301,13 @@ static int kthread(void *_create) init_completion(&self->parked); current->vfork_done = &self->exited; + /* + * The new thread inherited kthreadd's priority and CPU mask. Reset + * back to default in case they have been changed. + */ + sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); + set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD)); + /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; @@ -397,7 +405,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), } task = create->result; if (!IS_ERR(task)) { - static const struct sched_param param = { .sched_priority = 0 }; char name[TASK_COMM_LEN]; /* @@ -406,13 +413,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), */ vsnprintf(name, sizeof(name), namefmt, args); set_task_comm(task, name); - /* - * root may have changed our (kthreadd's) priority or CPU mask. - * The kernel thread should not inherit these properties. - */ - sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(task, - housekeeping_cpumask(HK_FLAG_KTHREAD)); } kfree(create); return task; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a30702b847ba..3aa2a6df1f87 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5475,6 +5475,7 @@ static noinstr void check_flags(unsigned long flags) } } +#ifndef CONFIG_PREEMPT_RT /* * We dont accurately track softirq state in e.g. * hardirq contexts (such as on 4KSTACKS), so only @@ -5489,6 +5490,7 @@ static noinstr void check_flags(unsigned long flags) DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); } } +#endif if (!debug_locks) print_irqtrace_events(current); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index ea5a701ab240..547752d1e9c0 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1097,8 +1097,26 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, * which is wrong, as the other waiter is not in a deadlock * situation. 
*/ - if (owner == task) + if (owner == task) { +#if defined(DEBUG_WW_MUTEXES) && defined(CONFIG_DEBUG_LOCKING_API_SELFTESTS) + /* + * The lockdep selftest for ww-mutex assumes in a few cases + * the ww_ctx->contending_lock assignment via + * __ww_mutex_check_kill() which does not happen if the rtmutex + * detects the deadlock early. + */ + if (build_ww_mutex() && ww_ctx) { + struct rt_mutex *rtm; + + /* Check whether the waiter should backout immediately */ + rtm = container_of(lock, struct rt_mutex, rtmutex); + + __ww_mutex_add_waiter(waiter, rtm, ww_ctx); + __ww_mutex_check_kill(rtm, waiter, ww_ctx); + } +#endif return -EDEADLK; + } raw_spin_lock(&task->pi_lock); waiter->task = task; diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 5c9299aaabae..900220941caa 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -21,12 +21,13 @@ int max_lock_depth = 1024; */ static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock, unsigned int state, + struct lockdep_map *nest_lock, unsigned int subclass) { int ret; might_sleep(); - mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_); ret = __rt_mutex_lock(&lock->rtmutex, state); if (ret) mutex_release(&lock->dep_map, _RET_IP_); @@ -48,10 +49,16 @@ EXPORT_SYMBOL(rt_mutex_base_init); */ void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) { - __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass); + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass); } EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); +void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock) +{ + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0); +} +EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock); + #else /* !CONFIG_DEBUG_LOCK_ALLOC */ /** @@ -61,7 +68,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); */ void __sched rt_mutex_lock(struct rt_mutex *lock) { - __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0); + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0); } EXPORT_SYMBOL_GPL(rt_mutex_lock); #endif @@ -77,10 +84,25 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); */ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) { - return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0); + return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0); } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); +/** + * rt_mutex_lock_killable - lock a rt_mutex killable + * + * @lock: the rt_mutex to be locked + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + */ +int __sched rt_mutex_lock_killable(struct rt_mutex *lock) +{ + return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); + /** * rt_mutex_trylock - try to lock a rt_mutex * diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index d2912e44d61f..9e396a09fe0f 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -24,6 +24,17 @@ #define RT_MUTEX_BUILD_SPINLOCKS #include "rtmutex.c" +/* + * __might_resched() skips the state check as rtlocks are state + * preserving. Take RCU nesting into account as spin/read/write_lock() can + * legitimately nest into an RCU read side critical section. 
+ */ +#define RTLOCK_RESCHED_OFFSETS \ + (rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT) + +#define rtlock_might_resched() \ + __might_resched(__FILE__, __LINE__, RTLOCK_RESCHED_OFFSETS) + static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) { if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) @@ -32,7 +43,7 @@ static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) static __always_inline void __rt_spin_lock(spinlock_t *lock) { - ___might_sleep(__FILE__, __LINE__, 0); + rtlock_might_resched(); rtlock_lock(&lock->lock); rcu_read_lock(); migrate_disable(); @@ -210,7 +221,7 @@ EXPORT_SYMBOL(rt_write_trylock); void __sched rt_read_lock(rwlock_t *rwlock) { - ___might_sleep(__FILE__, __LINE__, 0); + rtlock_might_resched(); rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); rcu_read_lock(); @@ -220,7 +231,7 @@ EXPORT_SYMBOL(rt_read_lock); void __sched rt_write_lock(rwlock_t *rwlock) { - ___might_sleep(__FILE__, __LINE__, 0); + rtlock_might_resched(); rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); rcu_read_lock(); @@ -246,12 +257,6 @@ void __sched rt_write_unlock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_write_unlock); -int __sched rt_rwlock_is_contended(rwlock_t *rwlock) -{ - return rw_base_is_contended(&rwlock->rwbase); -} -EXPORT_SYMBOL(rt_rwlock_is_contended); - #ifdef CONFIG_DEBUG_LOCK_ALLOC void __rt_rwlock_init(rwlock_t *rwlock, const char *name, struct lock_class_key *key) diff --git a/kernel/panic.c b/kernel/panic.c index cefd7d82366f..d509c0694af9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -178,12 +178,28 @@ static void panic_print_sys_info(void) void panic(const char *fmt, ...) { static char buf[1024]; + va_list args2; va_list args; long i, i_next = 0, len; int state = 0; int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; + console_verbose(); + pr_emerg("Kernel panic - not syncing:\n"); + va_start(args2, fmt); + va_copy(args, args2); + vprintk(fmt, args2); + va_end(args2); +#ifdef CONFIG_DEBUG_BUGVERBOSE + /* + * Avoid nested stack-dumping if a panic occurs during oops processing + */ + if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) + dump_stack(); +#endif + pr_flush(1000, true); + /* * Disable local interrupts. This will prevent panic_smp_self_stop * from deadlocking the first cpu that invokes the panic, since @@ -214,24 +230,13 @@ void panic(const char *fmt, ...) 
if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) panic_smp_self_stop(); - console_verbose(); bust_spinlocks(1); - va_start(args, fmt); len = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); if (len && buf[len - 1] == '\n') buf[len - 1] = '\0'; - pr_emerg("Kernel panic - not syncing: %s\n", buf); -#ifdef CONFIG_DEBUG_BUGVERBOSE - /* - * Avoid nested stack-dumping if a panic occurs during oops processing - */ - if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) - dump_stack(); -#endif - /* * If kgdb is enabled, give it a chance to run before we stop all * the other CPUs or else we won't be able to debug processes left @@ -540,9 +545,11 @@ static u64 oops_id; static int init_oops_id(void) { +#ifndef CONFIG_PREEMPT_RT if (!oops_id) get_random_bytes(&oops_id, sizeof(oops_id)); else +#endif oops_id++; return 0; @@ -553,6 +560,7 @@ static void print_oops_end_marker(void) { init_oops_id(); pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); + pr_flush(1000, true); } /* diff --git a/kernel/power/main.c b/kernel/power/main.c index 7e646079fbeb..8b153aa90ecc 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -546,14 +546,13 @@ static int __init pm_debug_messages_setup(char *str) __setup("pm_debug_messages", pm_debug_messages_setup); /** - * __pm_pr_dbg - Print a suspend debug message to the kernel log. - * @defer: Whether or not to use printk_deferred() to print the message. + * pm_pr_dbg - Print a suspend debug message to the kernel log. * @fmt: Message format. * * The message will be emitted if enabled through the pm_debug_messages * sysfs attribute. */ -void __pm_pr_dbg(bool defer, const char *fmt, ...) +void pm_pr_dbg(const char *fmt, ...) { struct va_format vaf; va_list args; @@ -566,10 +565,7 @@ void __pm_pr_dbg(bool defer, const char *fmt, ...) 
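panic() now emits the message before interrupts are disabled and the other CPUs are stopped, which means the va_list is consumed twice: once by the immediate vprintk() and once by the vscnprintf() into the static buffer further down. va_copy() is what makes the double traversal well defined. A self-contained userspace illustration of the same idiom:

    #include <stdarg.h>
    #include <stdio.h>

    static void report(const char *fmt, ...)
    {
            char buf[128];
            va_list args, args2;

            va_start(args2, fmt);
            va_copy(args, args2);   /* clone before the first traversal */
            vprintf(fmt, args2);    /* immediate output */
            va_end(args2);

            /* second traversal uses the untouched copy */
            vsnprintf(buf, sizeof(buf), fmt, args);
            va_end(args);
    }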
vaf.fmt = fmt; vaf.va = &args; - if (defer) - printk_deferred(KERN_DEBUG "PM: %pV", &vaf); - else - printk(KERN_DEBUG "PM: %pV", &vaf); + printk(KERN_DEBUG "PM: %pV", &vaf); va_end(args); } diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index d118739874c0..bc6b856a0ff4 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y = printk.o -obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o obj-$(CONFIG_PRINTK) += printk_ringbuffer.o obj-$(CONFIG_PRINTK_INDEX) += index.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 9f3ed2fdb721..de8ab059dd96 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -2,7 +2,6 @@ /* * internal.h - printk internal definitions */ -#include #ifdef CONFIG_PRINTK @@ -12,41 +11,6 @@ enum printk_info_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; -__printf(4, 0) -int vprintk_store(int facility, int level, - const struct dev_printk_info *dev_info, - const char *fmt, va_list args); - -__printf(1, 0) int vprintk_default(const char *fmt, va_list args); -__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); - -bool printk_percpu_data_ready(void); - -#define printk_safe_enter_irqsave(flags) \ - do { \ - local_irq_save(flags); \ - __printk_safe_enter(); \ - } while (0) - -#define printk_safe_exit_irqrestore(flags) \ - do { \ - __printk_safe_exit(); \ - local_irq_restore(flags); \ - } while (0) - -void defer_console_output(void); - u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags); -#else - -/* - * In !PRINTK builds we still export console_sem - * semaphore and some of console functions (console_unlock()/etc.), so - * printk-safe must preserve the existing local IRQ guarantees. - */ -#define printk_safe_enter_irqsave(flags) local_irq_save(flags) -#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) - -static inline bool printk_percpu_data_ready(void) { return false; } #endif /* CONFIG_PRINTK */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8d856b7c2e5a..ac2c44792ec4 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -44,6 +44,10 @@ #include #include #include +#include +#include +#include +#include #include #include #include @@ -227,19 +231,7 @@ static int nr_ext_console_drivers; static int __down_trylock_console_sem(unsigned long ip) { - int lock_failed; - unsigned long flags; - - /* - * Here and in __up_console_sem() we need to be in safe mode, - * because spindump/WARN/etc from under console ->lock will - * deadlock in printk()->down_trylock_console_sem() otherwise. - */ - printk_safe_enter_irqsave(flags); - lock_failed = down_trylock(&console_sem); - printk_safe_exit_irqrestore(flags); - - if (lock_failed) + if (down_trylock(&console_sem)) return 1; mutex_acquire(&console_lock_dep_map, 0, 1, ip); return 0; @@ -248,13 +240,9 @@ static int __down_trylock_console_sem(unsigned long ip) static void __up_console_sem(unsigned long ip) { - unsigned long flags; - mutex_release(&console_lock_dep_map, ip); - printk_safe_enter_irqsave(flags); up(&console_sem); - printk_safe_exit_irqrestore(flags); } #define up_console_sem() __up_console_sem(_RET_IP_) @@ -268,11 +256,6 @@ static void __up_console_sem(unsigned long ip) */ static int console_locked, console_suspended; -/* - * If exclusive_console is non-NULL then only this console is to be printed to. 
- */ -static struct console *exclusive_console; - /* * Array of consoles built from command line options (console=) */ @@ -352,10 +335,13 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; * non-prinatable characters are escaped in the "\xff" notation. */ +#ifdef CONFIG_PRINTK /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); -#ifdef CONFIG_PRINTK +/* Set to enable sync mode. Once set, it is never cleared. */ +static bool sync_mode; + DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ @@ -363,17 +349,6 @@ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; -/* All 3 protected by @console_sem. */ -/* the next printk record to write to the console */ -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; - -struct latched_seq { - seqcount_latch_t latch; - u64 val[2]; -}; - /* * The next printk record to read after the last 'clear' command. There are * two copies (updated with seqcount_latch) so that reads can locklessly @@ -391,9 +366,6 @@ static struct latched_seq clear_seq = { #define PREFIX_MAX 32 #endif -/* the maximum size of a formatted record (i.e. with prefix added per line) */ -#define CONSOLE_LOG_MAX 1024 - /* the maximum size allowed to be reserved for a record */ #define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX) @@ -432,12 +404,12 @@ static struct printk_ringbuffer *prb = &printk_rb_static; */ static bool __printk_percpu_data_ready __read_mostly; -bool printk_percpu_data_ready(void) +static bool printk_percpu_data_ready(void) { return __printk_percpu_data_ready; } -/* Must be called under syslog_lock. */ +/* Must be called under associated write-protection lock. */ static void latched_seq_write(struct latched_seq *ls, u64 val) { raw_write_seqcount_latch(&ls->latch); @@ -1771,188 +1743,152 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) return do_syslog(type, buf, len, SYSLOG_FROM_READER); } -/* - * Special console_lock variants that help to reduce the risk of soft-lockups. - * They allow to pass console_lock to another printk() call using a busy wait. - */ +int printk_delay_msec __read_mostly; -#ifdef CONFIG_LOCKDEP -static struct lockdep_map console_owner_dep_map = { - .name = "console_owner" -}; -#endif +static inline void printk_delay(int level) +{ + boot_delay_msec(level); -static DEFINE_RAW_SPINLOCK(console_owner_lock); -static struct task_struct *console_owner; -static bool console_waiter; + if (unlikely(printk_delay_msec)) { + int m = printk_delay_msec; -/** - * console_lock_spinning_enable - mark beginning of code where another - * thread might safely busy wait - * - * This basically converts console_lock into a spinlock. This marks - * the section where the console_lock owner can not sleep, because - * there may be a waiter spinning (like a spinlock). Also it must be - * ready to hand over the lock at the end of the section. 
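The per-console counters introduced later in this patch reuse the latched_seq scheme referenced above: two value slots plus a sequence counter, with the writer alternating slots so a lockless reader can always pick the slot that is not being written and never needs a retry loop. A compressed userspace sketch of the idea; it assumes a single serialized writer (which the kernel guarantees with a write-side lock) and simplifies the memory ordering:

    #include <stdatomic.h>
    #include <stdint.h>

    struct latched_u64 {
            atomic_uint seq;
            uint64_t    val[2];
    };

    static void latched_write(struct latched_u64 *ls, uint64_t v)
    {
            /* Bump the sequence first, then fill the now-unused slot. */
            unsigned int s = atomic_fetch_add_explicit(&ls->seq, 1,
                                                       memory_order_acq_rel);
            ls->val[(s + 1) & 1] = v;
    }

    static uint64_t latched_read(struct latched_u64 *ls)
    {
            unsigned int s = atomic_load_explicit(&ls->seq,
                                                  memory_order_acquire);
            return ls->val[s & 1];  /* the slot the writer is not touching */
    }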
- */ -static void console_lock_spinning_enable(void) + while (m--) { + mdelay(1); + touch_nmi_watchdog(); + } + } +} + +static bool kernel_sync_mode(void) { - raw_spin_lock(&console_owner_lock); - console_owner = current; - raw_spin_unlock(&console_owner_lock); + return (oops_in_progress || sync_mode); +} - /* The waiter may spin on us after setting console_owner */ - spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); +static bool console_may_sync(struct console *con) +{ + if (!(con->flags & CON_ENABLED)) + return false; + if (con->write_atomic && kernel_sync_mode()) + return true; + if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) + return true; + if (con->write && (con->flags & CON_BOOT) && !con->thread) + return true; + return false; } -/** - * console_lock_spinning_disable_and_check - mark end of code where another - * thread was able to busy wait and check if there is a waiter - * - * This is called at the end of the section where spinning is allowed. - * It has two functions. First, it is a signal that it is no longer - * safe to start busy waiting for the lock. Second, it checks if - * there is a busy waiter and passes the lock rights to her. - * - * Important: Callers lose the lock if there was a busy waiter. - * They must not touch items synchronized by console_lock - * in this case. - * - * Return: 1 if the lock rights were passed, 0 otherwise. - */ -static int console_lock_spinning_disable_and_check(void) +static bool call_sync_console_driver(struct console *con, const char *text, size_t text_len) { - int waiter; + if (!(con->flags & CON_ENABLED)) + return false; - raw_spin_lock(&console_owner_lock); - waiter = READ_ONCE(console_waiter); - console_owner = NULL; - raw_spin_unlock(&console_owner_lock); + if (con->write_atomic && kernel_sync_mode()) { + con->write_atomic(con, text, text_len); + return true; + } - if (!waiter) { - spin_release(&console_owner_dep_map, _THIS_IP_); - return 0; + if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) { + if (console_trylock()) { + con->write_atomic(con, text, text_len); + console_unlock(); + return true; + } + + } else if (con->write && (con->flags & CON_BOOT) && !con->thread) { + if (console_trylock()) { + con->write(con, text, text_len); + console_unlock(); + return true; + } } - /* The waiter is now free to continue */ - WRITE_ONCE(console_waiter, false); + return false; +} - spin_release(&console_owner_dep_map, _THIS_IP_); +static bool have_atomic_console(void) +{ + struct console *con; - /* - * Hand off console_lock to waiter. The waiter will perform - * the up(). After this, the waiter is the console_lock owner. - */ - mutex_release(&console_lock_dep_map, _THIS_IP_); - return 1; + for_each_console(con) { + if (!(con->flags & CON_ENABLED)) + continue; + if (con->write_atomic) + return true; + } + return false; } -/** - * console_trylock_spinning - try to get console_lock by busy waiting - * - * This allows to busy wait for the console_lock when the current - * owner is running in specially marked sections. It means that - * the current owner is running and cannot reschedule until it - * is ready to lose the lock. 
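console_may_sync() and call_sync_console_driver() key off the new ->write_atomic() callback. A console providing it must be callable from any context, including NMI, which in practice means polling the hardware with no locking. A hypothetical UART example; the base pointer, register offsets, and status bit are invented, and the callback signature is assumed from the call sites above:

    static void __iomem *foo_base;          /* hypothetical MMIO base */
    #define FOO_UART_TX     0x00
    #define FOO_UART_STAT   0x04
    #define FOO_TX_EMPTY    0x20

    static void foo_uart_write_atomic(struct console *con,
                                      const char *s, unsigned int count)
    {
            while (count--) {
                    /* busy-wait for TX ready: no locks, no sleeping */
                    while (!(readb(foo_base + FOO_UART_STAT) & FOO_TX_EMPTY))
                            cpu_relax();
                    writeb(*s++, foo_base + FOO_UART_TX);
            }
    }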
- * - * Return: 1 if we got the lock, 0 othrewise - */ -static int console_trylock_spinning(void) +static bool print_sync(struct console *con, u64 *seq) { - struct task_struct *owner = NULL; - bool waiter; - bool spin = false; - unsigned long flags; + struct printk_info info; + struct printk_record r; + size_t text_len; - if (console_trylock()) - return 1; + prb_rec_init_rd(&r, &info, &con->sync_buf[0], sizeof(con->sync_buf)); - printk_safe_enter_irqsave(flags); + if (!prb_read_valid(prb, *seq, &r)) + return false; - raw_spin_lock(&console_owner_lock); - owner = READ_ONCE(console_owner); - waiter = READ_ONCE(console_waiter); - if (!waiter && owner && owner != current) { - WRITE_ONCE(console_waiter, true); - spin = true; - } - raw_spin_unlock(&console_owner_lock); + text_len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); - /* - * If there is an active printk() writing to the - * consoles, instead of having it write our data too, - * see if we can offload that load from the active - * printer, and do some printing ourselves. - * Go into a spin only if there isn't already a waiter - * spinning, and there is an active printer, and - * that active printer isn't us (recursive printk?). - */ - if (!spin) { - printk_safe_exit_irqrestore(flags); - return 0; - } + if (!call_sync_console_driver(con, &con->sync_buf[0], text_len)) + return false; - /* We spin waiting for the owner to release us */ - spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); - /* Owner will clear console_waiter on hand off */ - while (READ_ONCE(console_waiter)) - cpu_relax(); - spin_release(&console_owner_dep_map, _THIS_IP_); + *seq = r.info->seq; - printk_safe_exit_irqrestore(flags); - /* - * The owner passed the console lock to us. - * Since we did not spin on console lock, annotate - * this as a trylock. Otherwise lockdep will - * complain. - */ - mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + rcu_cpu_stall_reset(); + touch_nmi_watchdog(); - return 1; + if (text_len) + printk_delay(r.info->level); + + return true; } -/* - * Call the console drivers, asking them to write out - * log_buf[start] to log_buf[end - 1]. - * The console_lock must be held. 
- */ -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) +static u64 read_console_seq(struct console *con) { - static char dropped_text[64]; - size_t dropped_len = 0; - struct console *con; + u64 seq2; + u64 seq; - trace_console_rcuidle(text, len); + seq = latched_seq_read_nolock(&con->printk_seq); + seq2 = latched_seq_read_nolock(&con->printk_sync_seq); + if (seq2 > seq) + seq = seq2; +#ifdef CONFIG_HAVE_NMI + seq2 = latched_seq_read_nolock(&con->printk_sync_nmi_seq); + if (seq2 > seq) + seq = seq2; +#endif + return seq; +} - if (!console_drivers) - return; +static void print_sync_until(struct console *con, u64 seq, bool is_locked) +{ + u64 printk_seq; - if (console_dropped) { - dropped_len = snprintf(dropped_text, sizeof(dropped_text), - "** %lu printk messages dropped **\n", - console_dropped); - console_dropped = 0; - } + while (!__printk_cpu_trylock()) + cpu_relax(); - for_each_console(con) { - if (exclusive_console && con != exclusive_console) - continue; - if (!(con->flags & CON_ENABLED)) - continue; - if (!con->write) - continue; - if (!cpu_online(smp_processor_id()) && - !(con->flags & CON_ANYTIME)) - continue; - if (con->flags & CON_EXTENDED) - con->write(con, ext_text, ext_len); - else { - if (dropped_len) - con->write(con, dropped_text, dropped_len); - con->write(con, text, len); - } + for (;;) { + printk_seq = read_console_seq(con); + if (printk_seq >= seq) + break; + if (!print_sync(con, &printk_seq)) + break; + + if (is_locked) + latched_seq_write(&con->printk_seq, printk_seq + 1); +#ifdef CONFIG_PRINTK_NMI + else if (in_nmi()) + latched_seq_write(&con->printk_sync_nmi_seq, printk_seq + 1); +#endif + else + latched_seq_write(&con->printk_sync_seq, printk_seq + 1); } + + __printk_cpu_unlock(); } /* @@ -2025,20 +1961,6 @@ static u8 *__printk_recursion_counter(void) local_irq_restore(flags); \ } while (0) -int printk_delay_msec __read_mostly; - -static inline void printk_delay(void) -{ - if (unlikely(printk_delay_msec)) { - int m = printk_delay_msec; - - while (m--) { - mdelay(1); - touch_nmi_watchdog(); - } - } -} - static inline u32 printk_caller_id(void) { return in_task() ? 
task_pid_nr(current) : @@ -2119,13 +2041,14 @@ static u16 printk_sprint(char *text, u16 size, int facility, } __printf(4, 0) -int vprintk_store(int facility, int level, - const struct dev_printk_info *dev_info, - const char *fmt, va_list args) +static int vprintk_store(int facility, int level, + const struct dev_printk_info *dev_info, + const char *fmt, va_list args) { const u32 caller_id = printk_caller_id(); struct prb_reserved_entry e; enum printk_info_flags flags = 0; + bool final_commit = false; struct printk_record r; unsigned long irqflags; u16 trunc_msg_len = 0; @@ -2136,6 +2059,7 @@ int vprintk_store(int facility, int level, u16 text_len; int ret = 0; u64 ts_nsec; + u64 seq; /* * Since the duration of printk() can vary depending on the message @@ -2174,6 +2098,7 @@ int vprintk_store(int facility, int level, if (flags & LOG_CONT) { prb_rec_init_wr(&r, reserve_size); if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { + seq = r.info->seq; text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, facility, &flags, fmt, args); r.info->text_len += text_len; @@ -2181,6 +2106,7 @@ int vprintk_store(int facility, int level, if (flags & LOG_NEWLINE) { r.info->flags |= LOG_NEWLINE; prb_final_commit(&e); + final_commit = true; } else { prb_commit(&e); } @@ -2204,6 +2130,7 @@ int vprintk_store(int facility, int level, if (!prb_reserve(&e, prb, &r)) goto out; } + seq = r.info->seq; /* fill message */ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args); @@ -2219,13 +2146,25 @@ int vprintk_store(int facility, int level, memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* A message without a trailing newline can be continued. */ - if (!(flags & LOG_NEWLINE)) + if (!(flags & LOG_NEWLINE)) { prb_commit(&e); - else + } else { prb_final_commit(&e); + final_commit = true; + } ret = text_len + trunc_msg_len; out: + /* only the kernel may perform synchronous printing */ + if (facility == 0 && final_commit) { + struct console *con; + + for_each_console(con) { + if (console_may_sync(con)) + print_sync_until(con, seq + 1, false); + } + } + printk_exit_irqrestore(recursion_ptr, irqflags); return ret; } @@ -2235,50 +2174,43 @@ asmlinkage int vprintk_emit(int facility, int level, const char *fmt, va_list args) { int printed_len; - bool in_sched = false; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) return 0; - if (level == LOGLEVEL_SCHED) { + if (level == LOGLEVEL_SCHED) level = LOGLEVEL_DEFAULT; - in_sched = true; - } - - boot_delay_msec(level); - printk_delay(); printed_len = vprintk_store(facility, level, dev_info, fmt, args); - /* If called from the scheduler, we can not call up(). */ - if (!in_sched) { - /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to - * console - */ - preempt_disable(); - /* - * Try to acquire and then immediately release the console - * semaphore. The release will print out buffers and wake up - * /dev/kmsg and syslog() users. 
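Note the final_commit gate above: synchronous printing is attempted only for records that are finalized (terminated by a newline) and only for facility 0, i.e. messages generated by the kernel itself. A continuation sequence therefore stays buffered until it is complete:

    printk(KERN_INFO "stage 1 of 3 ...");   /* stored, not finalized:
                                             * no synchronous output yet */
    printk(KERN_CONT " done\n");            /* newline finalizes the record,
                                             * making it eligible for
                                             * print_sync_until() */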
- */ - if (console_trylock_spinning()) - console_unlock(); - preempt_enable(); - } - wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); -int vprintk_default(const char *fmt, va_list args) +__printf(1, 0) +static int vprintk_default(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } -EXPORT_SYMBOL_GPL(vprintk_default); + +__printf(1, 0) +static int vprintk_func(const char *fmt, va_list args) +{ +#ifdef CONFIG_KGDB_KDB + /* Allow to pass printk() to kdb but avoid a recursion. */ + if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) + return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); +#endif + return vprintk_default(fmt, args); +} + +asmlinkage int vprintk(const char *fmt, va_list args) +{ + return vprintk_func(fmt, args); +} +EXPORT_SYMBOL(vprintk); asmlinkage __visible int _printk(const char *fmt, ...) { @@ -2293,37 +2225,162 @@ asmlinkage __visible int _printk(const char *fmt, ...) } EXPORT_SYMBOL(_printk); -#else /* CONFIG_PRINTK */ +static int printk_kthread_func(void *data) +{ + struct console *con = data; + unsigned long dropped = 0; + char *dropped_text = NULL; + struct printk_info info; + struct printk_record r; + char *ext_text = NULL; + size_t dropped_len; + int ret = -ENOMEM; + char *text = NULL; + char *write_text; + size_t len; + int error; + u64 seq; + + if (con->flags & CON_EXTENDED) { + ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); + if (!ext_text) + goto out; + } + text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); + dropped_text = kmalloc(64, GFP_KERNEL); + if (!text || !dropped_text) + goto out; + if (con->flags & CON_EXTENDED) + write_text = ext_text; + else + write_text = text; + + seq = read_console_seq(con); -#define CONSOLE_LOG_MAX 0 -#define printk_time false + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); -#define prb_read_valid(rb, seq, r) false -#define prb_first_valid_seq(rb) 0 + for (;;) { + error = wait_event_interruptible(log_wait, + prb_read_valid(prb, seq, &r) || kthread_should_stop()); -static u64 syslog_seq; -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; + if (kthread_should_stop()) + break; + + if (error) + continue; + + if (seq != r.info->seq) { + dropped += r.info->seq - seq; + seq = r.info->seq; + } + + seq++; + + if (!(con->flags & CON_ENABLED)) + continue; + + if (suppress_message_printing(r.info->level)) + continue; + + if (con->flags & CON_EXTENDED) { + len = info_print_ext_header(ext_text, + CONSOLE_EXT_LOG_MAX, + r.info); + len += msg_print_ext_body(ext_text + len, + CONSOLE_EXT_LOG_MAX - len, + &r.text_buf[0], r.info->text_len, + &r.info->dev_info); + } else { + len = record_print_text(&r, + console_msg_format & MSG_FORMAT_SYSLOG, + printk_time); + } + + console_lock(); + + /* + * Even though the printk kthread is always preemptible, it is + * still not allowed to call cond_resched() from within + * console drivers. The task may become non-preemptible in the + * console driver call chain. For example, vt_console_print() + * takes a spinlock and then can call into fbcon_redraw(), + * which can conditionally invoke cond_resched(). 
+ */ + console_may_schedule = 0; + + if (kernel_sync_mode() && con->write_atomic) { + console_unlock(); + break; + } + + if (!(con->flags & CON_EXTENDED) && dropped) { + dropped_len = snprintf(dropped_text, 64, + "** %lu printk messages dropped **\n", + dropped); + dropped = 0; + + con->write(con, dropped_text, dropped_len); + printk_delay(r.info->level); + } + + con->write(con, write_text, len); + if (len) + printk_delay(r.info->level); + + latched_seq_write(&con->printk_seq, seq); + + console_unlock(); + } + ret = 0; +out: + kfree(dropped_text); + kfree(text); + kfree(ext_text); + pr_info("%sconsole [%s%d]: printing thread stopped\n", + (con->flags & CON_BOOT) ? "boot" : "", + con->name, con->index); + return ret; +} -static size_t record_print_text(const struct printk_record *r, - bool syslog, bool time) +/* Must be called within console_lock(). */ +static void start_printk_kthread(struct console *con) { - return 0; + con->thread = kthread_run(printk_kthread_func, con, + "pr/%s%d", con->name, con->index); + if (IS_ERR(con->thread)) { + pr_err("%sconsole [%s%d]: unable to start printing thread\n", + (con->flags & CON_BOOT) ? "boot" : "", + con->name, con->index); + return; + } + pr_info("%sconsole [%s%d]: printing thread started\n", + (con->flags & CON_BOOT) ? "boot" : "", + con->name, con->index); } -static ssize_t info_print_ext_header(char *buf, size_t size, - struct printk_info *info) + +/* protected by console_lock */ +static bool kthreads_started; + +/* Must be called within console_lock(). */ +static void console_try_thread(struct console *con) { - return 0; + if (kthreads_started) { + start_printk_kthread(con); + return; + } + + /* + * The printing threads have not been started yet. If this console + * can print synchronously, print all unprinted messages. + */ + if (console_may_sync(con)) { + unsigned long flags; + + local_irq_save(flags); + print_sync_until(con, prb_next_seq(prb), true); + local_irq_restore(flags); + } } -static ssize_t msg_print_ext_body(char *buf, size_t size, - char *text, size_t text_len, - struct dev_printk_info *dev_info) { return 0; } -static void console_lock_spinning_enable(void) { } -static int console_lock_spinning_disable_and_check(void) { return 0; } -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) {} -static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @@ -2580,34 +2637,6 @@ int is_console_locked(void) } EXPORT_SYMBOL(is_console_locked); -/* - * Check if we have any console that is capable of printing while cpu is - * booting or shutting down. Requires console_sem. - */ -static int have_callable_console(void) -{ - struct console *con; - - for_each_console(con) - if ((con->flags & CON_ENABLED) && - (con->flags & CON_ANYTIME)) - return 1; - - return 0; -} - -/* - * Can we actually use the console at this time on this cpu? - * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. 
- */ -static inline int can_use_console(void) -{ - return cpu_online(raw_smp_processor_id()) || have_callable_console(); -} - /** * console_unlock - unlock the console system * @@ -2624,140 +2653,13 @@ static inline int can_use_console(void) */ void console_unlock(void) { - static char ext_text[CONSOLE_EXT_LOG_MAX]; - static char text[CONSOLE_LOG_MAX]; - unsigned long flags; - bool do_cond_resched, retry; - struct printk_info info; - struct printk_record r; - u64 __maybe_unused next_seq; - if (console_suspended) { up_console_sem(); return; } - prb_rec_init_rd(&r, &info, text, sizeof(text)); - - /* - * Console drivers are called with interrupts disabled, so - * @console_may_schedule should be cleared before; however, we may - * end up dumping a lot of lines, for example, if called from - * console registration path, and should invoke cond_resched() - * between lines if allowable. Not doing so can cause a very long - * scheduling stall on a slow console leading to RCU stall and - * softlockup warnings which exacerbate the issue with more - * messages practically incapacitating the system. - * - * console_trylock() is not able to detect the preemptive - * context reliably. Therefore the value must be stored before - * and cleared after the "again" goto label. - */ - do_cond_resched = console_may_schedule; -again: - console_may_schedule = 0; - - /* - * We released the console_sem lock, so we need to recheck if - * cpu is online and (if not) is there at least one CON_ANYTIME - * console. - */ - if (!can_use_console()) { - console_locked = 0; - up_console_sem(); - return; - } - - for (;;) { - size_t ext_len = 0; - int handover; - size_t len; - -skip: - if (!prb_read_valid(prb, console_seq, &r)) - break; - - if (console_seq != r.info->seq) { - console_dropped += r.info->seq - console_seq; - console_seq = r.info->seq; - } - - if (suppress_message_printing(r.info->level)) { - /* - * Skip record we have buffered and already printed - * directly to the console when we received it, and - * record that has level above the console loglevel. - */ - console_seq++; - goto skip; - } - - /* Output to all consoles once old messages replayed. */ - if (unlikely(exclusive_console && - console_seq >= exclusive_console_stop_seq)) { - exclusive_console = NULL; - } - - /* - * Handle extended console text first because later - * record_print_text() will modify the record buffer in-place. - */ - if (nr_ext_console_drivers) { - ext_len = info_print_ext_header(ext_text, - sizeof(ext_text), - r.info); - ext_len += msg_print_ext_body(ext_text + ext_len, - sizeof(ext_text) - ext_len, - &r.text_buf[0], - r.info->text_len, - &r.info->dev_info); - } - len = record_print_text(&r, - console_msg_format & MSG_FORMAT_SYSLOG, - printk_time); - console_seq++; - - /* - * While actively printing out messages, if another printk() - * were to occur on another CPU, it may wait for this one to - * finish. This task can not be preempted if there is a - * waiter waiting to take over. - * - * Interrupts are disabled because the hand over to a waiter - * must not be interrupted until the hand over is completed - * (@console_waiter is cleared). 
- */ - printk_safe_enter_irqsave(flags); - console_lock_spinning_enable(); - - stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(ext_text, ext_len, text, len); - start_critical_timings(); - - handover = console_lock_spinning_disable_and_check(); - printk_safe_exit_irqrestore(flags); - if (handover) - return; - - if (do_cond_resched) - cond_resched(); - } - - /* Get consistent value of the next-to-be-used sequence number. */ - next_seq = console_seq; - console_locked = 0; up_console_sem(); - - /* - * Someone could have filled up the buffer again, so re-check if there's - * something to flush. In case we cannot trylock the console_sem again, - * there's a new owner and the console_unlock() from them will do the - * flush, no worries. - */ - retry = prb_read_valid(prb, next_seq, NULL); - if (retry && console_trylock()) - goto again; } EXPORT_SYMBOL(console_unlock); @@ -2807,18 +2709,20 @@ void console_unblank(void) */ void console_flush_on_panic(enum con_flush_mode mode) { - /* - * If someone else is holding the console lock, trylock will fail - * and may_schedule may be set. Ignore and proceed to unlock so - * that messages are flushed out. As this can be called from any - * context and we don't want to get preempted while flushing, - * ensure may_schedule is cleared. - */ - console_trylock(); - console_may_schedule = 0; + if (!console_trylock()) + return; + +#ifdef CONFIG_PRINTK + if (mode == CONSOLE_REPLAY_ALL) { + struct console *c; + u64 seq; + + seq = prb_first_valid_seq(prb); + for_each_console(c) + latched_seq_write(&c->printk_seq, seq); + } +#endif - if (mode == CONSOLE_REPLAY_ALL) - console_seq = prb_first_valid_seq(prb); console_unlock(); } @@ -2954,6 +2858,7 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) void register_console(struct console *newcon) { struct console *bcon = NULL; + u64 __maybe_unused seq = 0; int err; for_each_console(bcon) { @@ -2976,6 +2881,8 @@ void register_console(struct console *newcon) } } + newcon->thread = NULL; + if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; @@ -3017,8 +2924,10 @@ void register_console(struct console *newcon) * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) + if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { newcon->flags &= ~CON_PRINTBUFFER; + newcon->flags |= CON_HANDOVER; + } /* * Put this console in the list - keep the @@ -3040,27 +2949,21 @@ void register_console(struct console *newcon) if (newcon->flags & CON_EXTENDED) nr_ext_console_drivers++; - if (newcon->flags & CON_PRINTBUFFER) { - /* - * console_unlock(); will print out the buffered messages - * for us. - * - * We're about to replay the log buffer. Only do this to the - * just-registered console to avoid excessive message spam to - * the already-registered consoles. - * - * Set exclusive_console with disabled interrupts to reduce - * race window with eventual console_flush_on_panic() that - * ignores console_lock. - */ - exclusive_console = newcon; - exclusive_console_stop_seq = console_seq; +#ifdef CONFIG_PRINTK + if (!(newcon->flags & CON_PRINTBUFFER)) + seq = prb_next_seq(prb); - /* Get a consistent copy of @syslog_seq. 
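With one latched counter per console there is no global console_seq left to rewind, so a full replay is now implemented by resetting every console to the first valid ringbuffer record. The caller side keeps the existing interface:

    /* Replay the entire ringbuffer on all registered consoles ... */
    console_flush_on_panic(CONSOLE_REPLAY_ALL);

    /* ... or only make sure pending records go out. */
    console_flush_on_panic(CONSOLE_FLUSH_PENDING);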
*/ - mutex_lock(&syslog_lock); - console_seq = syslog_seq; - mutex_unlock(&syslog_lock); - } + seqcount_latch_init(&newcon->printk_seq.latch); + latched_seq_write(&newcon->printk_seq, seq); + seqcount_latch_init(&newcon->printk_sync_seq.latch); + latched_seq_write(&newcon->printk_sync_seq, seq); +#ifdef CONFIG_HAVE_NMI + seqcount_latch_init(&newcon->printk_sync_nmi_seq.latch); + latched_seq_write(&newcon->printk_sync_nmi_seq, seq); +#endif + + console_try_thread(newcon); +#endif /* CONFIG_PRINTK */ console_unlock(); console_sysfs_notify(); @@ -3134,6 +3037,9 @@ int unregister_console(struct console *console) console_unlock(); console_sysfs_notify(); + if (console->thread && !IS_ERR(console->thread)) + kthread_stop(console->thread); + if (console->exit) res = console->exit(console); @@ -3216,6 +3122,15 @@ static int __init printk_late_init(void) unregister_console(con); } } + +#ifdef CONFIG_PRINTK + console_lock(); + for_each_console(con) + start_printk_kthread(con); + kthreads_started = true; + console_unlock(); +#endif + ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, console_cpu_notify); WARN_ON(ret < 0); @@ -3239,14 +3154,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_OUTPUT) { - /* If trylock fails, someone else is doing the printing */ - if (console_trylock()) - console_unlock(); - } - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); + wake_up_interruptible_all(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = @@ -3293,29 +3202,7 @@ void defer_console_output(void) void printk_trigger_flush(void) { - defer_console_output(); -} - -int vprintk_deferred(const char *fmt, va_list args) -{ - int r; - - r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); - defer_console_output(); - - return r; -} - -int _printk_deferred(const char *fmt, ...) -{ - va_list args; - int r; - - va_start(args, fmt); - r = vprintk_deferred(fmt, args); - va_end(args); - - return r; + wake_up_klogd(); } /* @@ -3444,6 +3331,24 @@ void kmsg_dump(enum kmsg_dump_reason reason) { struct kmsg_dumper *dumper; + if (!oops_in_progress) { + /* + * If atomic consoles are available, activate kernel sync mode + * to make sure any final messages are visible. The trailing + * printk message is important to flush any pending messages. + */ + if (have_atomic_console()) { + sync_mode = true; + pr_info("enabled sync mode\n"); + } + + /* + * Give the printing threads time to flush, allowing up to + * 1s of no printing forward progress before giving up. 
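Whether a console starts from the oldest record or only sees new messages is still controlled by CON_PRINTBUFFER: with the flag, seq stays 0 and the backlog is replayed; without it, the console starts at prb_next_seq(). A hypothetical driver-side registration for reference:

    static struct console foo_console = {
            .name   = "foo",
            .write  = foo_console_write,    /* hypothetical write hook */
            .flags  = CON_PRINTBUFFER,      /* replay the backlog once */
            .index  = -1,
    };

    register_console(&foo_console);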
+ */ + pr_flush(1000, true); + } + rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { enum kmsg_dump_reason max_reason = dumper->max_reason; @@ -3626,6 +3531,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #ifdef CONFIG_SMP static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); static atomic_t printk_cpulock_nested = ATOMIC_INIT(0); +static unsigned int kgdb_cpu = -1; /** * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant @@ -3705,6 +3611,9 @@ EXPORT_SYMBOL(__printk_cpu_trylock); */ void __printk_cpu_unlock(void) { + bool trigger_kgdb = false; + unsigned int cpu; + if (atomic_read(&printk_cpulock_nested)) { atomic_dec(&printk_cpulock_nested); return; @@ -3715,6 +3624,12 @@ void __printk_cpu_unlock(void) * LMM(__printk_cpu_unlock:A) */ + cpu = smp_processor_id(); + if (kgdb_cpu == cpu) { + trigger_kgdb = true; + kgdb_cpu = -1; + } + /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs @@ -3735,6 +3650,98 @@ void __printk_cpu_unlock(void) */ atomic_set_release(&printk_cpulock_owner, -1); /* LMM(__printk_cpu_unlock:B) */ + + if (trigger_kgdb) { + pr_warn("re-triggering kgdb roundup for CPU#%d\n", cpu); + kgdb_roundup_cpu(cpu); + } } EXPORT_SYMBOL(__printk_cpu_unlock); + +bool kgdb_roundup_delay(unsigned int cpu) +{ + if (cpu != atomic_read(&printk_cpulock_owner)) + return false; + + kgdb_cpu = cpu; + return true; +} +EXPORT_SYMBOL(kgdb_roundup_delay); #endif /* CONFIG_SMP */ + +#ifdef CONFIG_PRINTK +static void pr_msleep(bool may_sleep, int ms) +{ + if (may_sleep) { + msleep(ms); + } else { + while (ms--) + udelay(1000); + } +} + +/** + * pr_flush() - Wait for printing threads to catch up. + * + * @timeout_ms: The maximum time (in ms) to wait. + * @reset_on_progress: Reset the timeout if forward progress is seen. + * + * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 + * represents infinite waiting. + * + * If @reset_on_progress is true, the timeout will be reset whenever any + * printer has been seen to make some forward progress. + * + * Context: Any context. + * Return: true if all enabled printers are caught up. 
+ */ +bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + int remaining = timeout_ms; + struct console *con; + u64 last_diff = 0; + bool may_sleep; + u64 printk_seq; + u64 diff; + u64 seq; + + may_sleep = (preemptible() && + !in_softirq() && + system_state >= SYSTEM_RUNNING); + + seq = prb_next_seq(prb); + + for (;;) { + diff = 0; + + for_each_console(con) { + if (!(con->flags & CON_ENABLED)) + continue; + printk_seq = read_console_seq(con); + if (printk_seq < seq) + diff += seq - printk_seq; + } + + if (diff != last_diff && reset_on_progress) + remaining = timeout_ms; + + if (diff == 0 || remaining == 0) + break; + + if (remaining < 0) { + pr_msleep(may_sleep, 100); + } else if (remaining < 100) { + pr_msleep(may_sleep, remaining); + remaining = 0; + } else { + pr_msleep(may_sleep, 100); + remaining -= 100; + } + + last_diff = diff; + } + + return (diff == 0); +} +EXPORT_SYMBOL(pr_flush); +#endif /* CONFIG_PRINTK */ diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c deleted file mode 100644 index ef0f9a2044da..000000000000 --- a/kernel/printk/printk_safe.c +++ /dev/null @@ -1,52 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * printk_safe.c - Safe printk for printk-deadlock-prone contexts - */ - -#include -#include -#include -#include -#include -#include - -#include "internal.h" - -static DEFINE_PER_CPU(int, printk_context); - -/* Can be preempted by NMI. */ -void __printk_safe_enter(void) -{ - this_cpu_inc(printk_context); -} - -/* Can be preempted by NMI. */ -void __printk_safe_exit(void) -{ - this_cpu_dec(printk_context); -} - -asmlinkage int vprintk(const char *fmt, va_list args) -{ -#ifdef CONFIG_KGDB_KDB - /* Allow to pass printk() to kdb but avoid a recursion. */ - if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) - return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); -#endif - - /* - * Use the main logbuf even in NMI. But avoid calling console - * drivers that might have their own locks. - */ - if (this_cpu_read(printk_context) || in_nmi()) { - int len; - - len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); - defer_console_output(); - return len; - } - - /* No obstacles. */ - return vprintk_default(fmt, args); -} -EXPORT_SYMBOL(vprintk); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0cf547531ddf..0df2de214daa 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -197,7 +197,18 @@ static bool ptrace_freeze_traced(struct task_struct *task) spin_lock_irq(&task->sighand->siglock); if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && !__fatal_signal_pending(task)) { +#ifdef CONFIG_PREEMPT_RT + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + if (READ_ONCE(task->__state) & __TASK_TRACED) + WRITE_ONCE(task->__state, __TASK_TRACED); + else + task->saved_state = __TASK_TRACED; + raw_spin_unlock_irqrestore(&task->pi_lock, flags); +#else WRITE_ONCE(task->__state, __TASK_TRACED); +#endif ret = true; } spin_unlock_irq(&task->sighand->siglock); @@ -207,7 +218,11 @@ static bool ptrace_freeze_traced(struct task_struct *task) static void ptrace_unfreeze_traced(struct task_struct *task) { - if (READ_ONCE(task->__state) != __TASK_TRACED) + unsigned long flags; + bool frozen = true; + + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && + READ_ONCE(task->__state) != __TASK_TRACED) return; WARN_ON(!task->ptrace || task->parent != current); @@ -217,12 +232,21 @@ static void ptrace_unfreeze_traced(struct task_struct *task) * Recheck state under the lock to close this race. 
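pr_flush() is the synchronization point between printk() callers and the printing threads; kmsg_dump() and the oops end marker above use it with a one second budget. Typical call pattern:

    /* Wait up to 1s, restarting the budget whenever any console advances. */
    if (!pr_flush(1000, true))
            pr_warn("some console output may not have been flushed\n");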
*/ spin_lock_irq(&task->sighand->siglock); - if (READ_ONCE(task->__state) == __TASK_TRACED) { - if (__fatal_signal_pending(task)) - wake_up_state(task, __TASK_TRACED); - else - WRITE_ONCE(task->__state, TASK_TRACED); - } + raw_spin_lock_irqsave(&task->pi_lock, flags); + if (READ_ONCE(task->__state) == __TASK_TRACED) + WRITE_ONCE(task->__state, TASK_TRACED); + +#ifdef CONFIG_PREEMPT_RT + else if (task->saved_state == __TASK_TRACED) + task->saved_state = TASK_TRACED; +#endif + else + frozen = false; + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + if (frozen && __fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + spin_unlock_irq(&task->sighand->siglock); } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 60c9eacac25b..2dbcd58383e3 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1350,7 +1350,7 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) rttd->notrun = true; } -static void rcu_tasks_initiate_self_tests(void) +void rcu_tasks_initiate_self_tests(void) { pr_info("Running RCU-tasks wait API self tests\n"); #ifdef CONFIG_TASKS_RCU @@ -1387,9 +1387,7 @@ static int rcu_tasks_verify_self_tests(void) return ret; } late_initcall(rcu_tasks_verify_self_tests); -#else /* #ifdef CONFIG_PROVE_RCU */ -static void rcu_tasks_initiate_self_tests(void) { } -#endif /* #else #ifdef CONFIG_PROVE_RCU */ +#endif /* #ifdef CONFIG_PROVE_RCU */ void __init rcu_init_tasks_generic(void) { @@ -1404,9 +1402,6 @@ void __init rcu_init_tasks_generic(void) #ifdef CONFIG_TASKS_TRACE_RCU rcu_spawn_tasks_trace_kthread(); #endif - - // Run the self-tests. - rcu_tasks_initiate_self_tests(); } #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4a9d68b1fdc..350b2bc83051 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2279,13 +2279,13 @@ rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; - bool needwake = false; - const bool offloaded = rcu_rdp_is_offloaded(rdp); + bool offloaded, needwake = false; struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); + offloaded = rcu_rdp_is_offloaded(rdp); if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || rdp->gpwrap) { @@ -2447,7 +2447,7 @@ static void rcu_do_batch(struct rcu_data *rdp) int div; bool __maybe_unused empty; unsigned long flags; - const bool offloaded = rcu_rdp_is_offloaded(rdp); + bool offloaded; struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count = 0; @@ -2473,6 +2473,7 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_lock(rdp); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); pending = rcu_segcblist_n_cbs(&rdp->cblist); + offloaded = rcu_rdp_is_offloaded(rdp); div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b89ca5c83143..f47f5724edca 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -75,7 +75,11 @@ __read_mostly int sysctl_resched_latency_warn_once = 1; * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ +#ifdef CONFIG_PREEMPT_RT +const_debug unsigned int sysctl_sched_nr_migrate = 8; +#else const_debug unsigned int sysctl_sched_nr_migrate = 32; +#endif /* * period over which we measure -rt task CPU usage in us. 
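Background for the ptrace changes above: on PREEMPT_RT a task blocked on a sleeping spinlock carries its original state in task->saved_state while __state holds the rtlock wait state, so a state check must inspect both fields under pi_lock. A conceptual sketch of the helper shape that the wait_task_inactive() changes below rely on; the name and exact RT-tree implementation are assumptions:

    static bool task_state_match(struct task_struct *p, unsigned int state)
    {
            unsigned long flags;
            bool match;

            raw_spin_lock_irqsave(&p->pi_lock, flags);
            match = READ_ONCE(p->__state) == state;
    #ifdef CONFIG_PREEMPT_RT
            if (!match)
                    match = p->saved_state == state;
    #endif
            raw_spin_unlock_irqrestore(&p->pi_lock, flags);
            return match;
    }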
@@ -983,6 +987,46 @@ void resched_curr(struct rq *rq) trace_sched_wake_idle_without_ipi(cpu); } +#ifdef CONFIG_PREEMPT_LAZY + +static int tsk_is_polling(struct task_struct *p) +{ +#ifdef TIF_POLLING_NRFLAG + return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); +#else + return 0; +#endif +} + +void resched_curr_lazy(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + int cpu; + + if (!sched_feat(PREEMPT_LAZY)) { + resched_curr(rq); + return; + } + + if (test_tsk_need_resched(curr)) + return; + + if (test_tsk_need_resched_lazy(curr)) + return; + + set_tsk_need_resched_lazy(curr); + + cpu = cpu_of(rq); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED_LAZY must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(curr)) + smp_send_reschedule(cpu); +} +#endif + void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -2138,6 +2182,7 @@ void migrate_disable(void) preempt_disable(); this_rq()->nr_pinned++; p->migration_disabled = 1; + preempt_lazy_disable(); preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_disable); @@ -2149,6 +2194,8 @@ void migrate_enable(void) if (p->migration_disabled > 1) { p->migration_disabled--; return; + } else if (WARN_ON_ONCE(p->migration_disabled == 0)) { + return; } /* @@ -2166,6 +2213,7 @@ void migrate_enable(void) barrier(); p->migration_disabled = 0; this_rq()->nr_pinned--; + preempt_lazy_enable(); preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @@ -2945,9 +2993,8 @@ void force_compatible_cpus_allowed_ptr(struct task_struct *p) out_set_mask: if (printk_ratelimit()) { - printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", - task_pid_nr(p), p->comm, - cpumask_pr_args(override_mask)); + printk("Overriding affinity for process %d (%s) to CPUs %*pbl\n", + task_pid_nr(p), p->comm, cpumask_pr_args(override_mask)); } WARN_ON(set_cpus_allowed_ptr(p, override_mask)); @@ -3203,7 +3250,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state * is actually now running somewhere else! */ while (task_running(rq, p)) { - if (match_state && unlikely(READ_ONCE(p->__state) != match_state)) + if (match_state && !task_match_state_lock(p, match_state)) return 0; cpu_relax(); } @@ -3218,7 +3265,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state running = task_running(rq, p); queued = task_on_rq_queued(p); ncsw = 0; - if (!match_state || READ_ONCE(p->__state) == match_state) + if (!match_state || task_match_state_or_saved(p, match_state)) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &rf); @@ -3252,7 +3299,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state ktime_t to = NSEC_PER_SEC / HZ; set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&to, HRTIMER_MODE_REL); + schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); continue; } @@ -3377,8 +3424,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * leave kernel. 
*/ if (p->mm && printk_ratelimit()) { - printk_deferred("process %d (%s) no longer affine to cpu%d\n", - task_pid_nr(p), p->comm, cpu); + printk("process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); } } @@ -4386,6 +4433,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->on_cpu = 0; #endif init_task_preempt_count(p); +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(p)->preempt_lazy_count = 0; +#endif #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); @@ -4880,20 +4930,18 @@ static struct rq *finish_task_switch(struct task_struct *prev) */ if (mm) { membarrier_mm_sync_core_before_usermode(mm); - mmdrop(mm); + mmdrop_sched(mm); } if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. + * Release VMAP'ed task stack immediately for reuse. On RT + * enabled kernels this is delayed for latency reasons. */ - kprobe_flush_task(prev); - - /* Task is done with its stack. */ - put_task_stack(prev); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + put_task_stack(prev); put_task_struct_rcu_user(prev); } @@ -6294,6 +6342,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); + clear_tsk_need_resched_lazy(prev); clear_preempt_need_resched(); #ifdef CONFIG_SCHED_DEBUG rq->last_seen_need_resched_ns = 0; @@ -6511,6 +6560,30 @@ static void __sched notrace preempt_schedule_common(void) } while (need_resched()); } +#ifdef CONFIG_PREEMPT_LAZY +/* + * If TIF_NEED_RESCHED is set then we allow being scheduled away since this + * is set by an RT task. Otherwise we try to avoid being scheduled out as long + * as the preempt_lazy_count counter is >0. + */ +static __always_inline int preemptible_lazy(void) +{ + if (test_thread_flag(TIF_NEED_RESCHED)) + return 1; + if (current_thread_info()->preempt_lazy_count) + return 0; + return 1; +} + +#else + +static inline int preemptible_lazy(void) +{ + return 1; +} + +#endif + #ifdef CONFIG_PREEMPTION /* * This is the entry point to schedule() from in-kernel preemption @@ -6524,7 +6597,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) */ if (likely(!preemptible())) return; - + if (!preemptible_lazy()) + return; preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); @@ -6557,6 +6631,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) if (likely(!preemptible())) return; + if (!preemptible_lazy()) + return; + do { /* * Because the function tracer can trace preempt_count_sub() @@ -8709,7 +8786,9 @@ void __init init_idle(struct task_struct *idle, int cpu) /* Set the preempt count _outside_ the spinlocks!
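The two flags compose as follows: TIF_NEED_RESCHED, set on behalf of RT tasks, always preempts; TIF_NEED_RESCHED_LAZY, set for fair tasks via resched_curr_lazy(), is honoured only once preempt_lazy_count has dropped to zero. Condensed into one decision point (a sketch, not the literal entry code):

    if (test_thread_flag(TIF_NEED_RESCHED)) {
            schedule();             /* hard request: preempt immediately */
    } else if (test_thread_flag(TIF_NEED_RESCHED_LAZY) &&
               !current_thread_info()->preempt_lazy_count) {
            schedule();             /* lazy request and not lazily pinned */
    }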
*/ init_idle_preempt_count(idle, cpu); - +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(idle)->preempt_lazy_count = 0; +#endif /* * The idle tasks have their own, simple scheduling class: */ @@ -9503,14 +9582,8 @@ void __init sched_init(void) } #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -static inline int preempt_count_equals(int preempt_offset) -{ - int nested = preempt_count() + rcu_preempt_depth(); - - return (nested == preempt_offset); -} -void __might_sleep(const char *file, int line, int preempt_offset) +void __might_sleep(const char *file, int line) { unsigned int state = get_current_state(); /* @@ -9524,11 +9597,32 @@ void __might_sleep(const char *file, int line, int preempt_offset) (void *)current->task_state_change, (void *)current->task_state_change); - ___might_sleep(file, line, preempt_offset); + __might_resched(file, line, 0); } EXPORT_SYMBOL(__might_sleep); -void ___might_sleep(const char *file, int line, int preempt_offset) +static void print_preempt_disable_ip(int preempt_offset, unsigned long ip) +{ + if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) + return; + + if (preempt_count() == preempt_offset) + return; + + pr_err("Preemption disabled at:"); + print_ip_sym(KERN_ERR, ip); +} + +static inline bool resched_offsets_ok(unsigned int offsets) +{ + unsigned int nested = preempt_count(); + + nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT; + + return nested == offsets; +} + +void __might_resched(const char *file, int line, unsigned int offsets) { /* Ratelimiting timestamp: */ static unsigned long prev_jiffy; @@ -9538,7 +9632,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) /* WARN_ON_ONCE() by default, no rate limit required: */ rcu_sleep_check(); - if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + if ((resched_offsets_ok(offsets) && !irqs_disabled() && !is_idle_task(current) && !current->non_block_count) || system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || oops_in_progress) @@ -9551,29 +9645,33 @@ void ___might_sleep(const char *file, int line, int preempt_offset) /* Save this before calling printk(), since that will clobber it: */ preempt_disable_ip = get_preempt_disable_ip(current); - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), current->non_block_count, - current->pid, current->comm); + pr_err("BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), current->non_block_count, + current->pid, current->comm); + pr_err("preempt_count: %x, expected: %x\n", preempt_count(), + offsets & MIGHT_RESCHED_PREEMPT_MASK); + + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { + pr_err("RCU nest depth: %d, expected: %u\n", + rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT); + } if (task_stack_end_corrupted(current)) - printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + pr_emerg("Thread overran stack, or stack corrupted\n"); debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); - if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) - && !preempt_count_equals(preempt_offset)) { - pr_err("Preemption disabled at:"); - print_ip_sym(KERN_ERR, preempt_disable_ip); - } + + print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK, + preempt_disable_ip); + dump_stack(); add_taint(TAINT_WARN, 
LOCKDEP_STILL_OK); } -EXPORT_SYMBOL(___might_sleep); +EXPORT_SYMBOL(__might_resched); void __cant_sleep(const char *file, int line, int preempt_offset) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fffcb1aa77b7..2799117917c7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -800,7 +800,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) * entity. */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { - printk_deferred_once("sched: DL replenish lagged too much\n"); + printk_once("sched: DL replenish lagged too much\n"); dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; dl_se->runtime = pi_of(dl_se)->dl_runtime; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fcbacc35d2b9..99e1ccd10439 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4247,10 +4247,7 @@ static inline void check_schedstat_required(void) trace_sched_stat_iowait_enabled() || trace_sched_stat_blocked_enabled() || trace_sched_stat_runtime_enabled()) { - printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " - "stat_blocked and stat_runtime require the " - "kernel parameter schedstats=enable or " - "kernel.sched_schedstats=1\n"); + printk_once("Scheduler tracepoints stat_sleep, stat_iowait, stat_blocked and stat_runtime require the kernel parameter schedstats=enable or kernel.sched_schedstats=1\n"); } #endif } @@ -4458,7 +4455,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. @@ -4482,7 +4479,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); } static void @@ -4625,7 +4622,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); return; } /* @@ -4765,7 +4762,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); } static __always_inline @@ -5528,7 +5525,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (task_current(rq, p)) - resched_curr(rq); + resched_curr_lazy(rq); return; } hrtick_start(rq, delta); @@ -7220,7 +7217,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: - resched_curr(rq); + resched_curr_lazy(rq); /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @@ -11123,7 +11120,7 @@ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. 
*/ swap(curr->vruntime, se->vruntime); - resched_curr(rq); + resched_curr_lazy(rq); } se->vruntime -= cfs_rq->min_vruntime; @@ -11150,7 +11147,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (task_current(rq, p)) { if (p->prio > oldprio) - resched_curr(rq); + resched_curr_lazy(rq); } else check_preempt_curr(rq, p, 0); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 7f8dace0964c..d5cee51819bf 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -46,11 +46,19 @@ SCHED_FEAT(DOUBLE_TICK, false) */ SCHED_FEAT(NONTASK_CAPACITY, true) +#ifdef CONFIG_PREEMPT_RT +SCHED_FEAT(TTWU_QUEUE, false) +# ifdef CONFIG_PREEMPT_LAZY +SCHED_FEAT(PREEMPT_LAZY, true) +# endif +#else + /* * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ SCHED_FEAT(TTWU_QUEUE, true) +#endif /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index cad2a1b34ed0..fca1bfa2763f 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -717,11 +717,10 @@ static void psi_group_change(struct psi_group *group, int cpu, if (groupc->tasks[t]) { groupc->tasks[t]--; } else if (!psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n", - cpu, t, groupc->tasks[0], - groupc->tasks[1], groupc->tasks[2], - groupc->tasks[3], groupc->tasks[4], - clear, set); + pr_err("psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", + cpu, t, groupc->tasks[0], + groupc->tasks[1], groupc->tasks[2], + groupc->tasks[3], clear, set); psi_bug = 1; } } @@ -787,9 +786,9 @@ static void psi_flags_change(struct task_struct *task, int clear, int set) if (((task->psi_flags & set) || (task->psi_flags & clear) != clear) && !psi_bug) { - printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", - task->pid, task->comm, task_cpu(task), - task->psi_flags, clear, set); + pr_err("psi: inconsistent task state! 
task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", + task->pid, task->comm, task_cpu(task), + task->psi_flags, clear, set); psi_bug = 1; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8007d087a57f..6ba8c7bdcdae 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -983,7 +983,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) */ if (likely(rt_b->rt_runtime)) { rt_rq->rt_throttled = 1; - printk_deferred_once("sched: RT throttling activated\n"); + printk_once("sched: RT throttling activated\n"); } else { /* * In case we did anyway, make it go away, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fe8be2f8a47d..38f9078fcaaa 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2323,6 +2323,15 @@ extern void reweight_task(struct task_struct *p, int prio); extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); +#ifdef CONFIG_PREEMPT_LAZY +extern void resched_curr_lazy(struct rq *rq); +#else +static inline void resched_curr_lazy(struct rq *rq) +{ + resched_curr(rq); +} +#endif + extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index e1c655f928c7..f230b1ac7f91 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -64,6 +64,7 @@ void swake_up_all(struct swait_queue_head *q) struct swait_queue *curr; LIST_HEAD(tmp); + WARN_ON(irqs_disabled()); raw_spin_lock_irq(&q->lock); list_splice_init(&q->task_list, &tmp); while (!list_empty(&tmp)) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 4e8698e62f07..3d0157bd4e14 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -526,7 +526,7 @@ static int init_rootdomain(struct root_domain *rd) #ifdef HAVE_RT_PUSH_IPI rd->rto_cpu = -1; raw_spin_lock_init(&rd->rto_lock); - init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); + rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); #endif rd->visit_gen = 0; diff --git a/kernel/signal.c b/kernel/signal.c index d831f0aec56e..24fee2a3788a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1324,6 +1324,34 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, struct k_sigaction *action; int sig = info->si_signo; + /* + * On some archs, PREEMPT_RT has to delay sending a signal from a trap + * since it cannot enable preemption, and the signal code's spin_locks + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will + * send the signal on exit of the trap. + */ +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (in_atomic()) { + struct task_struct *t = current; + + if (WARN_ON_ONCE(t->forced_info.si_signo)) + return 0; + + if (is_si_special(info)) { + WARN_ON_ONCE(info != SEND_SIG_PRIV); + t->forced_info.si_signo = info->si_signo; + t->forced_info.si_errno = 0; + t->forced_info.si_code = SI_KERNEL; + t->forced_info.si_pid = 0; + t->forced_info.si_uid = 0; + } else { + t->forced_info = *info; + } + + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + return 0; + } +#endif spin_lock_irqsave(&t->sighand->siglock, flags); action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; @@ -2308,16 +2336,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t if (gstop_done && ptrace_reparented(current)) do_notify_parent_cldstop(current, false, why); - /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive.
- * - * XXX: implement read_unlock_no_resched(). - */ - preempt_disable(); read_unlock(&tasklist_lock); cgroup_enter_frozen(); - preempt_enable_no_resched(); freezable_schedule(); cgroup_leave_frozen(true); } else { diff --git a/kernel/smp.c b/kernel/smp.c index b68d63e965db..d00f1dda09c6 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -690,10 +690,20 @@ void flush_smp_call_function_from_idle(void) cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_IDLE); + local_irq_save(flags); flush_smp_call_function_queue(true); - if (local_softirq_pending()) - do_softirq(); + + if (local_softirq_pending()) { + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + do_softirq(); + } else { + struct task_struct *ksoftirqd = this_cpu_ksoftirqd(); + + if (ksoftirqd && !task_is_running(ksoftirqd)) + wake_up_process(ksoftirqd); + } + } local_irq_restore(flags); } diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 003ccf338d20..00fc43605c6b 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -203,8 +203,7 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev) { /* Nothing to do if we already reached the limit */ if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { - printk_deferred(KERN_WARNING - "CE: Reprogramming failure. Giving up\n"); + pr_warn("CE: Reprogramming failure. Giving up\n"); dev->next_event = KTIME_MAX; return -ETIME; } @@ -217,10 +216,8 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev) if (dev->min_delta_ns > MIN_DELTA_LIMIT) dev->min_delta_ns = MIN_DELTA_LIMIT; - printk_deferred(KERN_WARNING - "CE: %s increased min_delta_ns to %llu nsec\n", - dev->name ? dev->name : "?", - (unsigned long long) dev->min_delta_ns); + pr_warn("CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? dev->name : "?", (unsigned long long) dev->min_delta_ns); return 0; } diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 406dccb79c2b..829d7797811f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -939,9 +939,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) time_status |= STA_PPSERROR; pps_errcnt++; pps_dec_freq_interval(); - printk_deferred(KERN_ERR - "hardpps: PPSERROR: interval too long - %lld s\n", - freq_norm.sec); + pr_err("hardpps: PPSERROR: interval too long - %lld s\n", freq_norm.sec); return 0; } @@ -954,8 +952,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); pps_freq = ftemp; if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { - printk_deferred(KERN_WARNING - "hardpps: PPSWANDER: change=%ld\n", delta); + pr_warn("hardpps: PPSWANDER: change=%ld\n", delta); time_status |= STA_PPSWANDER; pps_stbcnt++; pps_dec_freq_interval(); @@ -999,9 +996,8 @@ static void hardpps_update_phase(long error) * the time offset is updated. 
*/ if (jitter > (pps_jitter << PPS_POPCORN)) { - printk_deferred(KERN_WARNING - "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", - jitter, (pps_jitter << PPS_POPCORN)); + pr_warn("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (pps_jitter << PPS_POPCORN)); time_status |= STA_PPSJITTER; pps_jitcnt++; } else if (time_status & STA_PPSTIME) { @@ -1058,7 +1054,7 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); + pr_err("hardpps: PPSJITTER: bad pulse\n"); return; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 871c912860ed..f18cad9a14df 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -204,22 +204,23 @@ static void timekeeping_check_update(struct timekeeper *tk, u64 offset) const char *name = tk->tkr_mono.clock->name; if (offset > max_cycles) { - printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", - offset, name, max_cycles); - printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); + printk("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", + offset, name, max_cycles); + printk(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); } else { if (offset > (max_cycles >> 1)) { - printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", - offset, name, max_cycles >> 1); - printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); + printk("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", + offset, name, max_cycles >> 1); + printk(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); } } if (tk->underflow_seen) { if (jiffies - tk->last_warning > WARNING_FREQ) { - printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); - printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); - printk_deferred(" Your kernel is probably still fine.\n"); + printk("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", + name); + printk(" Please report this, consider using a different clocksource, if possible.\n"); + printk(" Your kernel is probably still fine.\n"); tk->last_warning = jiffies; } tk->underflow_seen = 0; @@ -227,9 +228,10 @@ static void timekeeping_check_update(struct timekeeper *tk, u64 offset) if (tk->overflow_seen) { if (jiffies - tk->last_warning > WARNING_FREQ) { - printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); - printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); - printk_deferred(" Your kernel is probably still fine.\n"); + printk("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", + name); + printk(" Please report this, consider using a different clocksource, if possible.\n"); + printk(" Your kernel is probably still fine.\n"); tk->last_warning = jiffies; } tk->overflow_seen = 0; @@ -1669,9 +1671,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, const struct timespec64 *delta) { if 
(!timespec64_valid_strict(delta)) { - printk_deferred(KERN_WARNING - "__timekeeping_inject_sleeptime: Invalid " - "sleep delta value!\n"); + pr_warn("%s: Invalid sleep delta value!\n", __func__); return; } tk_xtime_add(tk, delta); diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index b73e8850e58d..149cc4b08d8e 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -49,7 +49,7 @@ void tk_debug_account_sleep_time(const struct timespec64 *t) int bin = min(fls(t->tv_sec), NUM_BINS-1); sleep_time_bin[bin]++; - pm_deferred_pr_dbg("Timekeeping suspended for %lld.%03lu seconds\n", + pm_pr_dbg("Timekeeping suspended for %lld.%03lu seconds\n", (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 518ce39a878d..53ea832567d7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2636,7 +2636,13 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) trace_flags |= TRACE_FLAG_NEED_RESCHED; if (test_preempt_need_resched()) trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; - return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | +#ifdef CONFIG_PREEMPT_LAZY + if (need_resched_lazy()) + trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; +#endif + + return (trace_flags << 24) | (min_t(unsigned int, pc & 0xff, 0xf)) | + (preempt_lazy_count() & 0xff) << 16 | (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } @@ -4217,15 +4223,17 @@ unsigned long trace_total_entries(struct trace_array *tr) static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n" - "# / _-----=> irqs-off \n" - "# | / _----=> need-resched \n" - "# || / _---=> hardirq/softirq \n" - "# ||| / _--=> preempt-depth \n" - "# |||| / _-=> migrate-disable \n" - "# ||||| / delay \n" - "# cmd pid |||||| time | caller \n" - "# \\ / |||||| \\ | / \n"); + seq_puts(m, "# _--------=> CPU# \n" + "# / _-------=> irqs-off \n" + "# | / _------=> need-resched \n" + "# || / _-----=> need-resched-lazy\n" + "# ||| / _----=> hardirq/softirq \n" + "# |||| / _---=> preempt-depth \n" + "# ||||| / _--=> preempt-lazy-depth\n" + "# |||||| / _-=> migrate-disable \n" + "# ||||||| / delay \n" + "# cmd pid |||||||| time | caller \n" + "# \\ / |||||||| \\ | / \n"); } static void print_event_info(struct array_buffer *buf, struct seq_file *m) @@ -4259,14 +4267,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file print_event_info(buf, m); - seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); - seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); - seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); - seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); - seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space); - seq_printf(m, "# %.*s|||| / delay\n", prec, space); - seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID "); - seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | "); + seq_printf(m, "# %.*s _-------=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s / _------=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _-----=> need-resched-lazy\n", prec, space); + seq_printf(m, "# %.*s|| / _----=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s||| / _---=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s|||| / _--=> preempt-lazy-depth\n", prec, space); + seq_printf(m, "# %.*s||||| / _-=> migrate-disable\n", prec, space); + seq_printf(m, "# %.*s|||||| / delay\n", 
prec, space); + seq_printf(m, "# TASK-PID %.*s CPU# ||||||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | ||||||| | |\n", prec, " | "); } void diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c072e8b9849c..0098d7713f91 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -192,6 +192,7 @@ static int trace_define_common_fields(void) /* Holds both preempt_count and migrate_disable */ __common_field(unsigned char, preempt_count); __common_field(int, pid); + __common_field(unsigned char, preempt_lazy_count); return ret; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c2ca40e8595b..be070d258c3b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -451,6 +451,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { char hardsoft_irq; char need_resched; + char need_resched_lazy; char irqs_off; int hardirq; int softirq; @@ -481,6 +482,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) break; } + need_resched_lazy = + (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.'; + hardsoft_irq = (nmi && hardirq) ? 'Z' : nmi ? 'z' : @@ -489,14 +493,20 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) softirq ? 's' : '.' ; - trace_seq_printf(s, "%c%c%c", - irqs_off, need_resched, hardsoft_irq); + trace_seq_printf(s, "%c%c%c%c", + irqs_off, need_resched, need_resched_lazy, + hardsoft_irq); if (entry->preempt_count & 0xf) trace_seq_printf(s, "%x", entry->preempt_count & 0xf); else trace_seq_putc(s, '.'); + if (entry->preempt_lazy_count) + trace_seq_printf(s, "%x", entry->preempt_lazy_count); + else + trace_seq_putc(s, '.'); + if (entry->preempt_count & 0xf0) trace_seq_printf(s, "%x", entry->preempt_count >> 4); else diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3f4d27668576..fa66b2ac3198 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4845,9 +4845,7 @@ void show_workqueue_state(void) * drivers that queue work while holding locks * also taken in their write paths. */ - printk_deferred_enter(); show_pwq(pwq); - printk_deferred_exit(); } raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* @@ -4871,7 +4869,6 @@ void show_workqueue_state(void) * queue work while holding locks also taken in their write * paths. 
*/ - printk_deferred_enter(); pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); pr_cont(" hung=%us workers=%d", @@ -4886,7 +4883,6 @@ void show_workqueue_state(void) first = false; } pr_cont("\n"); - printk_deferred_exit(); next_pool: raw_spin_unlock_irqrestore(&pool->lock, flags); /* diff --git a/lib/bug.c b/lib/bug.c index 45a0584f6541..03a87df69ed2 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -206,6 +206,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) else pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n", (void *)bugaddr); + pr_flush(1000, true); return BUG_TRAP_TYPE_BUG; } diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 6b7f1bf6715d..6e8ae42c7e27 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -102,9 +102,9 @@ asmlinkage __visible void dump_stack_lvl(const char *log_lvl) * Permit this cpu to perform nested stack dumps while serialising * against other CPUs */ - printk_cpu_lock_irqsave(flags); + raw_printk_cpu_lock_irqsave(flags); __dump_stack(log_lvl); - printk_cpu_unlock_irqrestore(flags); + raw_printk_cpu_unlock_irqrestore(flags); } EXPORT_SYMBOL(dump_stack_lvl); diff --git a/lib/irq_poll.c b/lib/irq_poll.c index 2f17b488d58e..2b9f797642f6 100644 --- a/lib/irq_poll.c +++ b/lib/irq_poll.c @@ -191,11 +191,13 @@ static int irq_poll_cpu_dead(unsigned int cpu) * If a CPU goes away, splice its entries to the current CPU * and trigger a run of the softirq */ + local_bh_disable(); local_irq_disable(); list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), this_cpu_ptr(&blk_cpu_iopoll)); __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); local_irq_enable(); + local_bh_enable(); return 0; } diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 161108e5d2fe..1266ea3726d7 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -26,6 +26,12 @@ #include #include +#ifdef CONFIG_PREEMPT_RT +# define NON_RT(...) +#else +# define NON_RT(...) 
__VA_ARGS__ +#endif + /* * Change this to 1 if you want to see the failure printouts: */ @@ -139,7 +145,7 @@ static DEFINE_RT_MUTEX(rtmutex_Z2); #endif -static local_lock_t local_A = INIT_LOCAL_LOCK(local_A); +static DEFINE_PER_CPU(local_lock_t, local_A); /* * non-inlined runtime initializers, to let separate locks share @@ -712,12 +718,18 @@ GENERATE_TESTCASE(ABCDBCDA_rtmutex); #undef E +#ifdef CONFIG_PREEMPT_RT +# define RT_PREPARE_DBL_UNLOCK() { migrate_disable(); rcu_read_lock(); } +#else +# define RT_PREPARE_DBL_UNLOCK() +#endif /* * Double unlock: */ #define E() \ \ LOCK(A); \ + RT_PREPARE_DBL_UNLOCK(); \ UNLOCK(A); \ UNLOCK(A); /* fail */ @@ -802,6 +814,7 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock) #include "locking-selftest-wlock-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-spin-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_spin) @@ -810,10 +823,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock) +#endif #undef E1 #undef E2 +#ifndef CONFIG_PREEMPT_RT /* * Enabling hardirqs with a softirq-safe lock held: */ @@ -846,6 +861,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) #undef E1 #undef E2 +#endif + /* * Enabling irqs with an irq-safe lock held: */ @@ -875,6 +892,7 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock) #include "locking-selftest-wlock-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-spin-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_spin) @@ -883,6 +901,7 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) +#endif #undef E1 #undef E2 @@ -921,6 +940,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock) #include "locking-selftest-wlock-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-spin-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_spin) @@ -929,6 +949,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) +#endif #undef E1 #undef E2 @@ -969,6 +990,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock) #include "locking-selftest-wlock-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-spin-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_spin) @@ -977,6 +999,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock) +#endif #undef E1 #undef E2 @@ -1031,6 +1054,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_hard_rlock) #include "locking-selftest-wlock-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-spin-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_spin) @@ -1039,6 +1063,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock) +#endif #undef E1 #undef E2 @@ -1206,12 +1231,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_hard_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_hard_wlock) +#ifndef 
CONFIG_PREEMPT_RT #include "locking-selftest-softirq.h" #include "locking-selftest-rlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_wlock) +#endif #undef E1 #undef E2 @@ -1252,12 +1279,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_hard_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-softirq.h" #include "locking-selftest-rlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft_wlock) +#endif #undef E1 #undef E2 @@ -1306,12 +1335,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_hard_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-softirq.h" #include "locking-selftest-rlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_wlock) +#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC # define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map) @@ -1320,7 +1351,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_wlock) # define I_MUTEX(x) lockdep_reset_lock(&mutex_##x.dep_map) # define I_RWSEM(x) lockdep_reset_lock(&rwsem_##x.dep_map) # define I_WW(x) lockdep_reset_lock(&x.dep_map) -# define I_LOCAL_LOCK(x) lockdep_reset_lock(&local_##x.dep_map) +# define I_LOCAL_LOCK(x) lockdep_reset_lock(this_cpu_ptr(&local_##x.dep_map)) #ifdef CONFIG_RT_MUTEXES # define I_RTMUTEX(x) lockdep_reset_lock(&rtmutex_##x.dep_map) #endif @@ -1380,7 +1411,7 @@ static void reset_locks(void) init_shared_classes(); raw_spin_lock_init(&raw_lock_A); raw_spin_lock_init(&raw_lock_B); - local_lock_init(&local_A); + local_lock_init(this_cpu_ptr(&local_A)); ww_mutex_init(&o, &ww_lockdep); ww_mutex_init(&o2, &ww_lockdep); ww_mutex_init(&o3, &ww_lockdep); memset(&t, 0, sizeof(t)); memset(&t2, 0, sizeof(t2)); @@ -1398,7 +1429,13 @@ static int unexpected_testcase_failures; static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) { - unsigned long saved_preempt_count = preempt_count(); + int saved_preempt_count = preempt_count(); +#ifdef CONFIG_PREEMPT_RT +#ifdef CONFIG_SMP + int saved_mgd_count = current->migration_disabled; +#endif + int saved_rcu_count = current->rcu_read_lock_nesting; +#endif WARN_ON(irqs_disabled()); @@ -1432,6 +1469,18 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) * count, so restore it: */ preempt_count_set(saved_preempt_count); + +#ifdef CONFIG_PREEMPT_RT +#ifdef CONFIG_SMP + while (current->migration_disabled > saved_mgd_count) + migrate_enable(); +#endif + + while (current->rcu_read_lock_nesting > saved_rcu_count) + rcu_read_unlock(); + WARN_ON_ONCE(current->rcu_read_lock_nesting < saved_rcu_count); +#endif + #ifdef CONFIG_TRACE_IRQFLAGS if (softirq_count()) current->softirqs_enabled = 0; @@ -1499,7 +1548,7 @@ static inline void print_testname(const char *testname) #define DO_TESTCASE_2x2RW(desc, name, nr) \ DO_TESTCASE_2RW("hard-"desc, name##_hard, nr) \ - DO_TESTCASE_2RW("soft-"desc, name##_soft, nr) \ + NON_RT(DO_TESTCASE_2RW("soft-"desc, name##_soft, nr)) \ #define DO_TESTCASE_6x2x2RW(desc, name) \ DO_TESTCASE_2x2RW(desc, name, 123); \ @@ -1547,19 +1596,19 @@ static inline void 
print_testname(const char *testname) #define DO_TESTCASE_2I(desc, name, nr) \ DO_TESTCASE_1("hard-"desc, name##_hard, nr); \ - DO_TESTCASE_1("soft-"desc, name##_soft, nr); + NON_RT(DO_TESTCASE_1("soft-"desc, name##_soft, nr)); #define DO_TESTCASE_2IB(desc, name, nr) \ DO_TESTCASE_1B("hard-"desc, name##_hard, nr); \ - DO_TESTCASE_1B("soft-"desc, name##_soft, nr); + NON_RT(DO_TESTCASE_1B("soft-"desc, name##_soft, nr)); #define DO_TESTCASE_6I(desc, name, nr) \ DO_TESTCASE_3("hard-"desc, name##_hard, nr); \ - DO_TESTCASE_3("soft-"desc, name##_soft, nr); + NON_RT(DO_TESTCASE_3("soft-"desc, name##_soft, nr)); #define DO_TESTCASE_6IRW(desc, name, nr) \ DO_TESTCASE_3RW("hard-"desc, name##_hard, nr); \ - DO_TESTCASE_3RW("soft-"desc, name##_soft, nr); + NON_RT(DO_TESTCASE_3RW("soft-"desc, name##_soft, nr)); #define DO_TESTCASE_2x3(desc, name) \ DO_TESTCASE_3(desc, name, 12); \ @@ -1651,6 +1700,22 @@ static void ww_test_fail_acquire(void) #endif } +#ifdef CONFIG_PREEMPT_RT +#define ww_mutex_base_lock(b) rt_mutex_lock(b) +#define ww_mutex_base_trylock(b) rt_mutex_trylock(b) +#define ww_mutex_base_lock_nest_lock(b, b2) rt_mutex_lock_nest_lock(b, b2) +#define ww_mutex_base_lock_interruptible(b) rt_mutex_lock_interruptible(b) +#define ww_mutex_base_lock_killable(b) rt_mutex_lock_killable(b) +#define ww_mutex_base_unlock(b) rt_mutex_unlock(b) +#else +#define ww_mutex_base_lock(b) mutex_lock(b) +#define ww_mutex_base_trylock(b) mutex_trylock(b) +#define ww_mutex_base_lock_nest_lock(b, b2) mutex_lock_nest_lock(b, b2) +#define ww_mutex_base_lock_interruptible(b) mutex_lock_interruptible(b) +#define ww_mutex_base_lock_killable(b) mutex_lock_killable(b) +#define ww_mutex_base_unlock(b) mutex_unlock(b) +#endif + static void ww_test_normal(void) { int ret; @@ -1665,50 +1728,50 @@ static void ww_test_normal(void) /* mutex_lock (and indirectly, mutex_lock_nested) */ o.ctx = (void *)~0UL; - mutex_lock(&o.base); - mutex_unlock(&o.base); + ww_mutex_base_lock(&o.base); + ww_mutex_base_unlock(&o.base); WARN_ON(o.ctx != (void *)~0UL); /* mutex_lock_interruptible (and *_nested) */ o.ctx = (void *)~0UL; - ret = mutex_lock_interruptible(&o.base); + ret = ww_mutex_base_lock_interruptible(&o.base); if (!ret) - mutex_unlock(&o.base); + ww_mutex_base_unlock(&o.base); else WARN_ON(1); WARN_ON(o.ctx != (void *)~0UL); /* mutex_lock_killable (and *_nested) */ o.ctx = (void *)~0UL; - ret = mutex_lock_killable(&o.base); + ret = ww_mutex_base_lock_killable(&o.base); if (!ret) - mutex_unlock(&o.base); + ww_mutex_base_unlock(&o.base); else WARN_ON(1); WARN_ON(o.ctx != (void *)~0UL); /* trylock, succeeding */ o.ctx = (void *)~0UL; - ret = mutex_trylock(&o.base); + ret = ww_mutex_base_trylock(&o.base); WARN_ON(!ret); if (ret) - mutex_unlock(&o.base); + ww_mutex_base_unlock(&o.base); else WARN_ON(1); WARN_ON(o.ctx != (void *)~0UL); /* trylock, failing */ o.ctx = (void *)~0UL; - mutex_lock(&o.base); - ret = mutex_trylock(&o.base); + ww_mutex_base_lock(&o.base); + ret = ww_mutex_base_trylock(&o.base); WARN_ON(ret); - mutex_unlock(&o.base); + ww_mutex_base_unlock(&o.base); WARN_ON(o.ctx != (void *)~0UL); /* nest_lock */ o.ctx = (void *)~0UL; - mutex_lock_nest_lock(&o.base, &t); - mutex_unlock(&o.base); + ww_mutex_base_lock_nest_lock(&o.base, &t); + ww_mutex_base_unlock(&o.base); WARN_ON(o.ctx != (void *)~0UL); } @@ -1721,7 +1784,7 @@ static void ww_test_two_contexts(void) static void ww_test_diff_class(void) { WWAI(&t); -#ifdef CONFIG_DEBUG_MUTEXES +#ifdef DEBUG_WW_MUTEXES t.ww_class = NULL; #endif WWL(&o, &t); @@ -1785,7 +1848,7 @@ static void ww_test_edeadlk_normal(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); 
o2.ctx = &t2; mutex_release(&o2.base.dep_map, _THIS_IP_); @@ -1801,7 +1864,7 @@ static void ww_test_edeadlk_normal(void) o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); - mutex_unlock(&o2.base); + ww_mutex_base_unlock(&o2.base); WWU(&o); WWL(&o2, &t); @@ -1811,7 +1874,7 @@ static void ww_test_edeadlk_normal_slow(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @@ -1827,7 +1890,7 @@ static void ww_test_edeadlk_normal_slow(void) o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); - mutex_unlock(&o2.base); + ww_mutex_base_unlock(&o2.base); WWU(&o); ww_mutex_lock_slow(&o2, &t); @@ -1837,7 +1900,7 @@ static void ww_test_edeadlk_no_unlock(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); o2.ctx = &t2; mutex_release(&o2.base.dep_map, _THIS_IP_); @@ -1853,7 +1916,7 @@ static void ww_test_edeadlk_no_unlock(void) o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); - mutex_unlock(&o2.base); + ww_mutex_base_unlock(&o2.base); WWL(&o2, &t); } @@ -1862,7 +1925,7 @@ static void ww_test_edeadlk_no_unlock_slow(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @@ -1878,7 +1941,7 @@ static void ww_test_edeadlk_no_unlock_slow(void) o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); - mutex_unlock(&o2.base); + ww_mutex_base_unlock(&o2.base); ww_mutex_lock_slow(&o2, &t); } @@ -1887,7 +1950,7 @@ static void ww_test_edeadlk_acquire_more(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @@ -1908,7 +1971,7 @@ static void ww_test_edeadlk_acquire_more_slow(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @@ -1929,11 +1992,11 @@ static void ww_test_edeadlk_acquire_more_edeadlk(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; - mutex_lock(&o3.base); + ww_mutex_base_lock(&o3.base); mutex_release(&o3.base.dep_map, _THIS_IP_); o3.ctx = &t2; @@ -1955,11 +2018,11 @@ static void ww_test_edeadlk_acquire_more_edeadlk_slow(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; - mutex_lock(&o3.base); + ww_mutex_base_lock(&o3.base); mutex_release(&o3.base.dep_map, _THIS_IP_); o3.ctx = &t2; @@ -1980,7 +2043,7 @@ static void ww_test_edeadlk_acquire_wrong(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @@ -2005,7 +2068,7 @@ static void ww_test_edeadlk_acquire_wrong_slow(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @@ -2646,8 +2709,8 @@ static void wait_context_tests(void) static void local_lock_2(void) { - local_lock_acquire(&local_A); /* IRQ-ON */ - local_lock_release(&local_A); + local_lock(&local_A); /* IRQ-ON */ + local_unlock(&local_A); HARDIRQ_ENTER(); spin_lock(&lock_A); /* IN-IRQ */ @@ -2656,18 +2719,18 @@ static void local_lock_2(void) HARDIRQ_DISABLE(); spin_lock(&lock_A); - local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle, false */ - local_lock_release(&local_A); + local_lock(&local_A); /* IN-IRQ <-> IRQ-ON cycle, false */ + local_unlock(&local_A); spin_unlock(&lock_A); HARDIRQ_ENABLE(); } 
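
The selftest hunks above and below convert from the lockdep-internal local_lock_acquire()/local_lock_release() on a static local_lock_t to local_lock()/local_unlock() on a DEFINE_PER_CPU instance. As a minimal sketch of that per-CPU local_lock pattern, the same shape this series uses for zs_map_area in mm/zsmalloc.c; foo_pcpu, count and foo_inc are illustrative names, not part of the patch:

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct foo_pcpu {
	local_lock_t lock;
	unsigned long count;
};

static DEFINE_PER_CPU(struct foo_pcpu, foo_pcpu) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void foo_inc(void)
{
	/*
	 * On !PREEMPT_RT local_lock() disables preemption; on PREEMPT_RT
	 * it takes a per-CPU spinlock and the section stays preemptible.
	 */
	local_lock(&foo_pcpu.lock);
	this_cpu_inc(foo_pcpu.count);
	local_unlock(&foo_pcpu.lock);
}
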
static void local_lock_3A(void) { - local_lock_acquire(&local_A); /* IRQ-ON */ + local_lock(&local_A); /* IRQ-ON */ spin_lock(&lock_B); /* IRQ-ON */ spin_unlock(&lock_B); - local_lock_release(&local_A); + local_unlock(&local_A); HARDIRQ_ENTER(); spin_lock(&lock_A); /* IN-IRQ */ @@ -2676,18 +2739,18 @@ static void local_lock_3A(void) HARDIRQ_DISABLE(); spin_lock(&lock_A); - local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ - local_lock_release(&local_A); + local_lock(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ + local_unlock(&local_A); spin_unlock(&lock_A); HARDIRQ_ENABLE(); } static void local_lock_3B(void) { - local_lock_acquire(&local_A); /* IRQ-ON */ + local_lock(&local_A); /* IRQ-ON */ spin_lock(&lock_B); /* IRQ-ON */ spin_unlock(&lock_B); - local_lock_release(&local_A); + local_unlock(&local_A); HARDIRQ_ENTER(); spin_lock(&lock_A); /* IN-IRQ */ @@ -2696,8 +2759,8 @@ static void local_lock_3B(void) HARDIRQ_DISABLE(); spin_lock(&lock_A); - local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ - local_lock_release(&local_A); + local_lock(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ + local_unlock(&local_A); spin_unlock(&lock_A); HARDIRQ_ENABLE(); @@ -2812,7 +2875,7 @@ void locking_selftest(void) printk("------------------------\n"); printk("| Locking API testsuite:\n"); printk("----------------------------------------------------------------------------\n"); - printk(" | spin |wlock |rlock |mutex | wsem | rsem |\n"); + printk(" | spin |wlock |rlock |mutex | wsem | rsem |rtmutex\n"); printk(" --------------------------------------------------------------------------\n"); init_shared_classes(); @@ -2885,12 +2948,11 @@ void locking_selftest(void) DO_TESTCASE_6x1RR("rlock W1R2/R2R3/W3W1", W1R2_R2R3_W3W1); printk(" --------------------------------------------------------------------------\n"); - /* * irq-context testcases: */ DO_TESTCASE_2x6("irqs-on + irq-safe-A", irqsafe1); - DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A); + NON_RT(DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A)); DO_TESTCASE_2x6("safe-A + irqs-on", irqsafe2B); DO_TESTCASE_6x6("safe-A + unsafe-B #1", irqsafe3); DO_TESTCASE_6x6("safe-A + unsafe-B #2", irqsafe4); diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 199ab201d501..06410209197a 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -99,7 +99,7 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) * Allow nested NMI backtraces while serializing * against other CPUs. 
*/ - printk_cpu_lock_irqsave(flags); + raw_printk_cpu_lock_irqsave(flags); if (!READ_ONCE(backtrace_idle) && regs && cpu_in_idle(instruction_pointer(regs))) { pr_warn("NMI backtrace for cpu %d skipped: idling at %pS\n", cpu, (void *)instruction_pointer(regs)); @@ -110,7 +110,7 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) else dump_stack(); } - printk_cpu_unlock_irqrestore(flags); + raw_printk_cpu_unlock_irqrestore(flags); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } diff --git a/lib/ratelimit.c b/lib/ratelimit.c index e01a93f46f83..524cf65dce53 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -47,9 +47,7 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) if (time_is_before_jiffies(rs->begin + rs->interval)) { if (rs->missed) { if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { - printk_deferred(KERN_WARNING - "%s: %d callbacks suppressed\n", - func, rs->missed); + pr_warn("%s: %d callbacks suppressed\n", func, rs->missed); rs->missed = 0; } } diff --git a/lib/scatterlist.c b/lib/scatterlist.c index abb3432ed744..d5e82e4a57ad 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -828,8 +828,7 @@ static bool sg_miter_get_next_page(struct sg_mapping_iter *miter) * stops @miter. * * Context: - * Don't care if @miter is stopped, or not proceeded yet. - * Otherwise, preemption disabled if the SG_MITER_ATOMIC is set. + * Don't care. * * Returns: * true if @miter contains the valid mapping. false if end of sg @@ -865,8 +864,7 @@ EXPORT_SYMBOL(sg_miter_skip); * @miter->addr and @miter->length point to the current mapping. * * Context: - * Preemption disabled if SG_MITER_ATOMIC. Preemption must stay disabled - * till @miter is stopped. May sleep if !SG_MITER_ATOMIC. + * May sleep if !SG_MITER_ATOMIC. * * Returns: * true if @miter contains the next mapping. false if end of sg @@ -906,8 +904,7 @@ EXPORT_SYMBOL(sg_miter_next); * need to be released during iteration. * * Context: - * Preemption disabled if the SG_MITER_ATOMIC is set. Don't care - * otherwise. + * Don't care otherwise. 
*/ void sg_miter_stop(struct sg_mapping_iter *miter) { @@ -922,7 +919,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter) flush_dcache_page(miter->page); if (miter->__flags & SG_MITER_ATOMIC) { - WARN_ON_ONCE(preemptible()); + WARN_ON_ONCE(!pagefault_disabled()); kunmap_atomic(miter->addr); } else kunmap(miter->page); diff --git a/localversion-rt b/localversion-rt new file mode 100644 index 000000000000..8a777ac42aab --- /dev/null +++ b/localversion-rt @@ -0,0 +1 @@ +-rt47 diff --git a/mm/Kconfig b/mm/Kconfig index c048dea7e342..88778414465b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -371,7 +371,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" - depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT select COMPACTION select XARRAY_MULTI help diff --git a/mm/memory.c b/mm/memory.c index 26d115ded4ab..307be06c9484 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5287,7 +5287,7 @@ void __might_fault(const char *file, int line) return; if (pagefault_disabled()) return; - __might_sleep(file, line, 0); + __might_sleep(file, line); #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(¤t->mm->mmap_lock); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a0b7afae59e9..ac537f5caa9d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3149,9 +3149,9 @@ static void drain_local_pages_wq(struct work_struct *work) * cpu which is alright but we also have to make sure to not move to * a different one. */ - preempt_disable(); + migrate_disable(); drain_local_pages(drain->zone); - preempt_enable(); + migrate_enable(); } /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8375eecc55de..f81f11bcce5a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1918,11 +1918,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) return ERR_PTR(err); } - vbq = &get_cpu_var(vmap_block_queue); + get_cpu_light(); + vbq = this_cpu_ptr(&vmap_block_queue); spin_lock(&vbq->lock); list_add_tail_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); - put_cpu_var(vmap_block_queue); + put_cpu_light(); return vaddr; } @@ -2001,7 +2002,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) order = get_order(size); rcu_read_lock(); - vbq = &get_cpu_var(vmap_block_queue); + get_cpu_light(); + vbq = this_cpu_ptr(&vmap_block_queue); list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long pages_off; @@ -2024,7 +2026,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) break; } - put_cpu_var(vmap_block_queue); + put_cpu_light(); rcu_read_unlock(); /* Allocate new block if nothing was found */ diff --git a/mm/workingset.c b/mm/workingset.c index 880d882f3325..2a9ed5aeb6fa 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -433,6 +433,8 @@ static struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { + struct address_space *mapping; + /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. @@ -441,7 +443,8 @@ void workingset_update_node(struct xa_node *node) * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. 
*/ - VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ + mapping = container_of(node->array, struct address_space, i_pages); + lockdep_assert_held(&mapping->i_pages.xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 439deb8decbc..a66431853394 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -57,6 +57,7 @@ #include #include #include +#include #define ZSPAGE_MAGIC 0x58 @@ -77,6 +78,20 @@ #define ZS_HANDLE_SIZE (sizeof(unsigned long)) +#ifdef CONFIG_PREEMPT_RT + +struct zsmalloc_handle { + unsigned long addr; + spinlock_t lock; +}; + +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle)) + +#else + +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long)) +#endif + /* * Object location (, ) is encoded as * a single (unsigned long) handle value. @@ -293,6 +308,7 @@ struct zspage { }; struct mapping_area { + local_lock_t lock; char *vm_buf; /* copy buffer for objects that span pages */ char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ @@ -322,7 +338,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} static int create_cache(struct zs_pool *pool) { - pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, + pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE, 0, 0, NULL); if (!pool->handle_cachep) return 1; @@ -346,10 +362,27 @@ static void destroy_cache(struct zs_pool *pool) static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) { - return (unsigned long)kmem_cache_alloc(pool->handle_cachep, - gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); + void *p; + + p = kmem_cache_alloc(pool->handle_cachep, + gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); +#ifdef CONFIG_PREEMPT_RT + if (p) { + struct zsmalloc_handle *zh = p; + + spin_lock_init(&zh->lock); + } +#endif + return (unsigned long)p; } +#ifdef CONFIG_PREEMPT_RT +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle) +{ + return (void *)(handle & ~((1 << OBJ_TAG_BITS) - 1)); +} +#endif + static void cache_free_handle(struct zs_pool *pool, unsigned long handle) { kmem_cache_free(pool->handle_cachep, (void *)handle); @@ -368,12 +401,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) static void record_obj(unsigned long handle, unsigned long obj) { +#ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + + WRITE_ONCE(zh->addr, obj); +#else /* * lsb of @obj represents handle lock while other bits * represent object value the handle is pointing so * updating shouldn't do store tearing. 
*/ WRITE_ONCE(*(unsigned long *)handle, obj); +#endif } /* zpool driver */ @@ -455,7 +494,9 @@ MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ -static DEFINE_PER_CPU(struct mapping_area, zs_map_area); +static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { + .lock = INIT_LOCAL_LOCK(lock), +}; static bool is_zspage_isolated(struct zspage *zspage) { @@ -862,7 +903,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) static unsigned long handle_to_obj(unsigned long handle) { +#ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + + return zh->addr; +#else return *(unsigned long *)handle; +#endif } static unsigned long obj_to_head(struct page *page, void *obj) @@ -876,22 +923,46 @@ static unsigned long obj_to_head(struct page *page, void *obj) static inline int testpin_tag(unsigned long handle) { +#ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + + return spin_is_locked(&zh->lock); +#else return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); +#endif } static inline int trypin_tag(unsigned long handle) { +#ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + + return spin_trylock(&zh->lock); +#else return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); +#endif } static void pin_tag(unsigned long handle) __acquires(bitlock) { +#ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + + return spin_lock(&zh->lock); +#else bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); +#endif } static void unpin_tag(unsigned long handle) __releases(bitlock) { +#ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + + return spin_unlock(&zh->lock); +#else bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); +#endif } static void reset_page(struct page *page) @@ -1274,7 +1345,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, class = pool->size_class[class_idx]; off = (class->size * obj_idx) & ~PAGE_MASK; - area = &get_cpu_var(zs_map_area); + local_lock(&zs_map_area.lock); + area = this_cpu_ptr(&zs_map_area); area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ @@ -1328,7 +1400,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } - put_cpu_var(zs_map_area); + local_unlock(&zs_map_area.lock); migrate_read_unlock(zspage); unpin_tag(handle); diff --git a/net/Kconfig b/net/Kconfig index fb13460c6dab..074472dfa94a 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -294,7 +294,7 @@ config CGROUP_NET_CLASSID config NET_RX_BUSY_POLL bool - default y + default y if !PREEMPT_RT config BQL bool diff --git a/net/core/dev.c b/net/core/dev.c index b9731b267d07..86c3ea9c93c5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -225,14 +225,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) static inline void rps_lock(struct softnet_data *sd) { #ifdef CONFIG_RPS - spin_lock(&sd->input_pkt_queue.lock); + raw_spin_lock(&sd->input_pkt_queue.raw_lock); #endif } static inline void rps_unlock(struct softnet_data *sd) { #ifdef CONFIG_RPS - spin_unlock(&sd->input_pkt_queue.lock); + raw_spin_unlock(&sd->input_pkt_queue.raw_lock); #endif } @@ -365,12 +365,12 @@ static void list_netdevice(struct net_device *dev) ASSERT_RTNL(); - write_lock_bh(&dev_base_lock); 
+ write_lock(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); dev_base_seq_inc(net); } @@ -383,11 +383,11 @@ static void unlist_netdevice(struct net_device *dev) ASSERT_RTNL(); /* Unlink dev from the device chain */ - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -1266,15 +1266,15 @@ int dev_change_name(struct net_device *dev, const char *newname) netdev_adjacent_rename_links(dev, oldname); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); netdev_name_node_del(dev->name_node); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); synchronize_rcu(); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); netdev_name_node_add(net, dev->name_node); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); @@ -3042,6 +3042,7 @@ static void __netif_reschedule(struct Qdisc *q) sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } void __netif_schedule(struct Qdisc *q) @@ -3104,6 +3105,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) __this_cpu_write(softnet_data.completion_queue, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } EXPORT_SYMBOL(__dev_kfree_skb_irq); @@ -3835,7 +3837,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. */ +#ifdef CONFIG_PREEMPT_RT + contended = true; +#else contended = qdisc_is_running(q); +#endif if (unlikely(contended)) spin_lock(&q->busylock); @@ -4660,6 +4666,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, rps_unlock(sd); local_irq_restore(flags); + preempt_check_resched_rt(); atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); @@ -4900,7 +4907,7 @@ static int netif_rx_internal(struct sk_buff *skb) struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; - preempt_disable(); + migrate_disable(); rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); @@ -4910,14 +4917,14 @@ static int netif_rx_internal(struct sk_buff *skb) ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - preempt_enable(); + migrate_enable(); } else #endif { unsigned int qtail; - ret = enqueue_to_backlog(skb, get_cpu(), &qtail); - put_cpu(); + ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail); + put_cpu_light(); } return ret; } @@ -4956,11 +4963,9 @@ int netif_rx_ni(struct sk_buff *skb) trace_netif_rx_ni_entry(skb); - preempt_disable(); + local_bh_disable(); err = netif_rx_internal(skb); - if (local_softirq_pending()) - do_softirq(); - preempt_enable(); + local_bh_enable(); trace_netif_rx_ni_exit(err); return err; @@ -6403,12 +6408,14 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) sd->rps_ipi_list = NULL; local_irq_enable(); + preempt_check_resched_rt(); /* Send pending IPI's to kick RPS processing on remote cpus. 
*/ net_rps_send_ipi(remsd); } else #endif local_irq_enable(); + preempt_check_resched_rt(); } static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) @@ -6486,6 +6493,7 @@ void __napi_schedule(struct napi_struct *n) local_irq_save(flags); ____napi_schedule(this_cpu_ptr(&softnet_data), n); local_irq_restore(flags); + preempt_check_resched_rt(); } EXPORT_SYMBOL(__napi_schedule); @@ -11314,6 +11322,7 @@ static int dev_cpu_dead(unsigned int oldcpu) raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); #ifdef CONFIG_RPS remsd = oldsd->rps_ipi_list; @@ -11327,7 +11336,7 @@ static int dev_cpu_dead(unsigned int oldcpu) netif_rx_ni(skb); input_queue_head_incr(oldsd); } - while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx_ni(skb); input_queue_head_incr(oldsd); } @@ -11642,7 +11651,7 @@ static int __init net_dev_init(void) INIT_WORK(flush, flush_backlog); - skb_queue_head_init(&sd->input_pkt_queue); + skb_queue_head_init_raw(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); #ifdef CONFIG_XFRM_OFFLOAD skb_queue_head_init(&sd->xfrm_backlog); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 8e582e29a41e..4fcbdd71c59f 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -40,10 +40,10 @@ */ struct net_rate_estimator { - struct gnet_stats_basic_packed *bstats; + struct gnet_stats_basic_sync *bstats; spinlock_t *stats_lock; - seqcount_t *running; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; + bool running; + struct gnet_stats_basic_sync __percpu *cpu_bstats; u8 ewma_log; u8 intvl_log; /* period : (250ms << intvl_log) */ @@ -60,13 +60,13 @@ struct net_rate_estimator { }; static void est_fetch_counters(struct net_rate_estimator *e, - struct gnet_stats_basic_packed *b) + struct gnet_stats_basic_sync *b) { - memset(b, 0, sizeof(*b)); + gnet_stats_basic_sync_init(b); if (e->stats_lock) spin_lock(e->stats_lock); - __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats); + gnet_stats_add_basic(b, e->cpu_bstats, e->bstats, e->running); if (e->stats_lock) spin_unlock(e->stats_lock); @@ -76,14 +76,18 @@ static void est_fetch_counters(struct net_rate_estimator *e, static void est_timer(struct timer_list *t) { struct net_rate_estimator *est = from_timer(est, t, timer); - struct gnet_stats_basic_packed b; + struct gnet_stats_basic_sync b; + u64 b_bytes, b_packets; u64 rate, brate; est_fetch_counters(est, &b); - brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log); + b_bytes = u64_stats_read(&b.bytes); + b_packets = u64_stats_read(&b.packets); + + brate = (b_bytes - est->last_bytes) << (10 - est->intvl_log); brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); - rate = (b.packets - est->last_packets) << (10 - est->intvl_log); + rate = (b_packets - est->last_packets) << (10 - est->intvl_log); rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); @@ -91,8 +95,8 @@ static void est_timer(struct timer_list *t) est->avpps += rate; write_seqcount_end(&est->seq); - est->last_bytes = b.bytes; - est->last_packets = b.packets; + est->last_bytes = b_bytes; + est->last_packets = b_packets; est->next_jiffies += ((HZ/4) << est->intvl_log); @@ -109,7 +113,9 @@ static void est_timer(struct timer_list *t) * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @lock: lock for statistics and control path - * @running: qdisc running seqcount + * @running: true if @bstats represents a 
running qdisc, thus @bstats' + * internal values might change during basic reads. Only used + * if @cpu_bstats is NULL + * @opt: rate estimator configuration TLV * * Creates a new rate estimator with &bstats as source and &rate_est @@ -121,16 +127,16 @@ static void est_timer(struct timer_list *t) * Returns 0 on success or a negative error code. * */ -int gen_new_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu_bstats, +int gen_new_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, - seqcount_t *running, + bool running, struct nlattr *opt) { struct gnet_estimator *parm = nla_data(opt); struct net_rate_estimator *old, *est; - struct gnet_stats_basic_packed b; + struct gnet_stats_basic_sync b; int intvl_log; if (nla_len(opt) < sizeof(*parm)) @@ -164,8 +170,8 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, est_fetch_counters(est, &b); if (lock) local_bh_enable(); - est->last_bytes = b.bytes; - est->last_packets = b.packets; + est->last_bytes = u64_stats_read(&b.bytes); + est->last_packets = u64_stats_read(&b.packets); if (lock) spin_lock_bh(lock); @@ -214,7 +220,9 @@ EXPORT_SYMBOL(gen_kill_estimator); * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @lock: lock for statistics and control path - * @running: qdisc running seqcount (might be NULL) + * @running: true if @bstats represents a running qdisc, thus @bstats' + * internal values might change during basic reads. Only used + * if @cpu_bstats is NULL + * @opt: rate estimator configuration TLV * * Replaces the configuration of a rate estimator by calling * * Returns 0 on success or a negative error code. 
*/ -int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu_bstats, +int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, - seqcount_t *running, struct nlattr *opt) + bool running, struct nlattr *opt) { return gen_new_estimator(bstats, cpu_bstats, rate_est, lock, running, opt); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index e491b083b348..a10335b4ba2d 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -18,7 +18,7 @@ #include #include #include - +#include static inline int gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) @@ -114,63 +114,112 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, } EXPORT_SYMBOL(gnet_stats_start_copy); -static void -__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu) +/* Must not be inlined, due to u64_stats seqcount_t lockdep key */ +void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b) { + u64_stats_set(&b->bytes, 0); + u64_stats_set(&b->packets, 0); + u64_stats_init(&b->syncp); +} +EXPORT_SYMBOL(gnet_stats_basic_sync_init); + +static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu) +{ + u64 t_bytes = 0, t_packets = 0; int i; for_each_possible_cpu(i) { - struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); + struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); unsigned int start; u64 bytes, packets; do { start = u64_stats_fetch_begin_irq(&bcpu->syncp); - bytes = bcpu->bstats.bytes; - packets = bcpu->bstats.packets; + bytes = u64_stats_read(&bcpu->bytes); + packets = u64_stats_read(&bcpu->packets); } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); - bstats->bytes += bytes; - bstats->packets += packets; + t_bytes += bytes; + t_packets += packets; + } + _bstats_update(bstats, t_bytes, t_packets); +} + +void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running) +{ + unsigned int start; + u64 bytes = 0; + u64 packets = 0; + + WARN_ON_ONCE((cpu || running) && in_hardirq()); + + if (cpu) { + gnet_stats_add_basic_cpu(bstats, cpu); + return; } + do { + if (running) + start = u64_stats_fetch_begin_irq(&b->syncp); + bytes = u64_stats_read(&b->bytes); + packets = u64_stats_read(&b->packets); + } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); + + _bstats_update(bstats, bytes, packets); } +EXPORT_SYMBOL(gnet_stats_add_basic); -void -__gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b) +static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running) { - unsigned int seq; + unsigned int start; if (cpu) { - __gnet_stats_copy_basic_cpu(bstats, cpu); + u64 t_bytes = 0, t_packets = 0; + int i; + + for_each_possible_cpu(i) { + struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); + unsigned int start; + u64 bytes, packets; + + do { + start = u64_stats_fetch_begin_irq(&bcpu->syncp); + bytes = u64_stats_read(&bcpu->bytes); + packets = u64_stats_read(&bcpu->packets); + } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); + + t_bytes += 
bytes; + t_packets += packets; + } + *ret_bytes = t_bytes; + *ret_packets = t_packets; return; } do { if (running) - seq = read_seqcount_begin(running); - bstats->bytes = b->bytes; - bstats->packets = b->packets; - } while (running && read_seqcount_retry(running, seq)); + start = u64_stats_fetch_begin_irq(&b->syncp); + *ret_bytes = u64_stats_read(&b->bytes); + *ret_packets = u64_stats_read(&b->packets); + } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); } -EXPORT_SYMBOL(__gnet_stats_copy_basic); static int -___gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b, - int type) +___gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + int type, bool running) { - struct gnet_stats_basic_packed bstats = {0}; + u64 bstats_bytes, bstats_packets; - __gnet_stats_copy_basic(running, &bstats, cpu, b); + gnet_stats_read_basic(&bstats_bytes, &bstats_packets, cpu, b, running); if (d->compat_tc_stats && type == TCA_STATS_BASIC) { - d->tc_stats.bytes = bstats.bytes; - d->tc_stats.packets = bstats.packets; + d->tc_stats.bytes = bstats_bytes; + d->tc_stats.packets = bstats_packets; } if (d->tail) { @@ -178,24 +227,28 @@ ___gnet_stats_copy_basic(const seqcount_t *running, int res; memset(&sb, 0, sizeof(sb)); - sb.bytes = bstats.bytes; - sb.packets = bstats.packets; + sb.bytes = bstats_bytes; + sb.packets = bstats_packets; res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); - if (res < 0 || sb.packets == bstats.packets) + if (res < 0 || sb.packets == bstats_packets) return res; /* emit 64bit stats only if needed */ - return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets, - sizeof(bstats.packets), TCA_STATS_PAD); + return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats_packets, + sizeof(bstats_packets), TCA_STATS_PAD); } return 0; } /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV - * @running: seqcount_t pointer * @d: dumping handle * @cpu: copy statistic per cpu * @b: basic statistics + * @running: true if @b represents a running qdisc, thus @b's + * internal values might change during basic reads. + * Only used if @cpu is NULL + * + * Context: task; must not be run from IRQ or BH contexts * * Appends the basic statistics to the top level TLV created by * gnet_stats_start_copy(). @@ -204,22 +257,25 @@ ___gnet_stats_copy_basic(const seqcount_t *running, * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b) +gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + bool running) { - return ___gnet_stats_copy_basic(running, d, cpu, b, - TCA_STATS_BASIC); + return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC, running); } EXPORT_SYMBOL(gnet_stats_copy_basic); /** * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV - * @running: seqcount_t pointer * @d: dumping handle * @cpu: copy statistic per cpu * @b: basic statistics + * @running: true if @b represents a running qdisc, thus @b's + * internal values might change during basic reads. + * Only used if @cpu is NULL + * + * Context: task; must not be run from IRQ or BH contexts * * Appends the basic statistics to the top level TLV created by * gnet_stats_start_copy(). 
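The gen_stats.c rework above is the heart of the series: the packed byte/packet counters and the external Qdisc::running seqcount are folded into one self-synchronizing type. For reference, this is the layout the series adds to include/net/gen_stats.h, followed by a minimal sketch (not part of the patch; demo_read_basic() is an illustrative name) of the reader pattern that gnet_stats_add_basic() and gnet_stats_read_basic() use:

#include <linux/u64_stats_sync.h>

struct gnet_stats_basic_sync {
	u64_stats_t bytes;
	u64_stats_t packets;
	struct u64_stats_sync syncp;
} __aligned(2 * sizeof(u64));

/* Sketch only: snapshot one counter pair consistently. */
static void demo_read_basic(struct gnet_stats_basic_sync *b,
			    u64 *bytes, u64 *packets)
{
	unsigned int start;

	do {
		/* On 32-bit SMP this opens a seqcount read section so a
		 * concurrent 64-bit update cannot be observed torn; on
		 * 64-bit kernels it collapses to plain loads.
		 */
		start = u64_stats_fetch_begin_irq(&b->syncp);
		*bytes = u64_stats_read(&b->bytes);
		*packets = u64_stats_read(&b->packets);
	} while (u64_stats_fetch_retry_irq(&b->syncp, start));
}

Because the consistency now lives inside each counter, the per-qdisc write seqcount (which was a preemption and lockdep headache on PREEMPT_RT) can go away, as the later sch_generic.c hunks show.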
@@ -228,13 +284,12 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_basic_hw(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b) +gnet_stats_copy_basic_hw(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + bool running) { - return ___gnet_stats_copy_basic(running, d, cpu, b, - TCA_STATS_BASIC_HW); + return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC_HW, running); } EXPORT_SYMBOL(gnet_stats_copy_basic_hw); @@ -282,16 +337,15 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, } EXPORT_SYMBOL(gnet_stats_copy_rate_est); -static void -__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *q) +static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *q) { int i; for_each_possible_cpu(i) { const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); - qstats->qlen = 0; + qstats->qlen += qcpu->backlog; qstats->backlog += qcpu->backlog; qstats->drops += qcpu->drops; qstats->requeues += qcpu->requeues; @@ -299,24 +353,21 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, } } -void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *cpu, - const struct gnet_stats_queue *q, - __u32 qlen) +void gnet_stats_add_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q) { if (cpu) { - __gnet_stats_copy_queue_cpu(qstats, cpu); + gnet_stats_add_queue_cpu(qstats, cpu); } else { - qstats->qlen = q->qlen; - qstats->backlog = q->backlog; - qstats->drops = q->drops; - qstats->requeues = q->requeues; - qstats->overlimits = q->overlimits; + qstats->qlen += q->qlen; + qstats->backlog += q->backlog; + qstats->drops += q->drops; + qstats->requeues += q->requeues; + qstats->overlimits += q->overlimits; } - - qstats->qlen = qlen; } -EXPORT_SYMBOL(__gnet_stats_copy_queue); +EXPORT_SYMBOL(gnet_stats_add_queue); /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV @@ -339,7 +390,8 @@ gnet_stats_copy_queue(struct gnet_dump *d, { struct gnet_stats_queue qstats = {0}; - __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen); + gnet_stats_add_queue(&qstats, cpu_q, q); + qstats.qlen = qlen; if (d->compat_tc_stats) { d->tc_stats.drops = qstats.drops; diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 1a455847da54..9599afd0862d 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -55,7 +55,7 @@ static void rfc2863_policy(struct net_device *dev) if (operstate == dev->operstate) return; - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); switch(dev->link_mode) { case IF_LINK_MODE_TESTING: @@ -74,7 +74,7 @@ static void rfc2863_policy(struct net_device *dev) dev->operstate = operstate; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9c0e8ccf9bc5..8c85e93daa73 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -842,9 +842,9 @@ static void set_operstate(struct net_device *dev, unsigned char transition) } if (dev->operstate != operstate) { - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); dev->operstate = operstate; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); netdev_state_change(dev); } } @@ -2781,11 +2781,11 @@ static int 
do_setlink(const struct sk_buff *skb, if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; dev->link_mode = value; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); } if (tb[IFLA_VFINFO_LIST]) { diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 26c32407f029..ea7b96e296ef 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -30,13 +30,13 @@ static bool is_slave_up(struct net_device *dev) static void __hsr_set_operstate(struct net_device *dev, int transition) { - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); if (dev->operstate != transition) { dev->operstate = transition; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); netdev_state_change(dev); } else { - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); } } diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 0d5c422f8745..8aec1b529364 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -94,11 +94,11 @@ static unsigned int xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_rateest_target_info *info = par->targinfo; - struct gnet_stats_basic_packed *stats = &info->est->bstats; + struct gnet_stats_basic_sync *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); - stats->bytes += skb->len; - stats->packets++; + u64_stats_add(&stats->bytes, skb->len); + u64_stats_inc(&stats->packets); spin_unlock_bh(&info->est->lock); return XT_CONTINUE; @@ -143,6 +143,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) if (!est) goto err1; + gnet_stats_basic_sync_init(&est->bstats); strlcpy(est->name, info->name, sizeof(est->name)); spin_lock_init(&est->lock); est->refcnt = 1; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 7d53272727bf..2f46f9f9afb9 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -480,16 +480,18 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, atomic_set(&p->tcfa_bindcnt, 1); if (cpustats) { - p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!p->cpu_bstats) goto err1; - p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!p->cpu_bstats_hw) goto err2; p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); if (!p->cpu_qstats) goto err3; } + gnet_stats_basic_sync_init(&p->tcfa_bstats); + gnet_stats_basic_sync_init(&p->tcfa_bstats_hw); spin_lock_init(&p->tcfa_lock); p->tcfa_index = index; p->tcfa_tm.install = jiffies; @@ -499,7 +501,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, if (est) { err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, &p->tcfa_rate_est, - &p->tcfa_lock, NULL, est); + &p->tcfa_lock, false, est); if (err) goto err4; } @@ -1135,13 +1137,13 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, u64 drops, bool hw) { if (a->cpu_bstats) { - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); + _bstats_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); this_cpu_ptr(a->cpu_qstats)->drops += drops; if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + _bstats_update(this_cpu_ptr(a->cpu_bstats_hw), + bytes, packets); return; } @@ -1180,9 
+1182,10 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, if (err < 0) goto errout; - if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || - gnet_stats_copy_basic_hw(NULL, &d, p->cpu_bstats_hw, - &p->tcfa_bstats_hw) < 0 || + if (gnet_stats_copy_basic(&d, p->cpu_bstats, + &p->tcfa_bstats, false) < 0 || + gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw, + &p->tcfa_bstats_hw, false) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, &p->tcfa_qstats, diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 5c36013339e1..f2bf896331a5 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -41,7 +41,7 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, int action, filter_res; tcf_lastuse_update(&prog->tcf_tm); - bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(prog->common.cpu_bstats), skb); filter = rcu_dereference(prog->filter); if (at_ingress) { diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 7064a365a1a9..b757f90a2d58 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -718,7 +718,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, u8 *tlv_data; u16 metalen; - bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (skb_at_tc_ingress(skb)) @@ -806,7 +806,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, exceed_mtu = true; } - bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (!metalen) { /* no metadata to send */ diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index e4529b428cf4..8faa4c58305e 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -59,7 +59,7 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, int ret, mac_len; tcf_lastuse_update(&m->tcf_tm); - bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb); /* Ensure 'data' points at mac_header prior calling mpls manipulating * functions. 
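All of the bstats_cpu_update() to bstats_update() conversions in the action hunks above and below fall out of the type merge: per-CPU and spinlock-protected counters are now the same struct, so one writer helper serves both. Sketched after the series' _bstats_update(), reusing the gnet_stats_basic_sync layout quoted earlier (a sketch, not a verbatim copy of the header):

/* Writer side: publish a bytes/packets increment under one
 * u64_stats write section, regardless of where the struct lives.
 */
static inline void demo_bstats_update(struct gnet_stats_basic_sync *bstats,
				      u64 bytes, u64 packets)
{
	u64_stats_update_begin(&bstats->syncp);
	u64_stats_add(&bstats->bytes, bytes);
	u64_stats_add(&bstats->packets, packets);
	u64_stats_update_end(&bstats->syncp);
}

This is also why xt_RATEEST above switches to u64_stats_add()/u64_stats_inc(): the fields are u64_stats_t now and must not be touched as plain integers.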
diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 5c0a3ea9fe12..cbeb9995df37 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -125,7 +125,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, police->common.cpu_bstats, &police->tcf_rate_est, &police->tcf_lock, - NULL, est); + false, est); if (err) goto failure; } else if (tb[TCA_POLICE_AVRATE] && @@ -262,7 +262,7 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, int ret; tcf_lastuse_update(&police->tcf_tm); - bstats_cpu_update(this_cpu_ptr(police->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb); ret = READ_ONCE(police->tcf_action); p = rcu_dereference_bh(police->params); diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 230501eb9e06..ce859b0e0deb 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -163,7 +163,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, int retval; tcf_lastuse_update(&s->tcf_tm); - bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(s->common.cpu_bstats), skb); retval = READ_ONCE(s->tcf_action); psample_group = rcu_dereference_bh(s->psample_group); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index cbbe1861d3a2..e617ab4505ca 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -36,7 +36,8 @@ static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, * then it would look like "hello_3" (without quotes) */ pr_info("simple: %s_%llu\n", - (char *)d->tcfd_defdata, d->tcf_bstats.packets); + (char *)d->tcfd_defdata, + u64_stats_read(&d->tcf_bstats.packets)); spin_unlock(&d->tcf_lock); return d->tcf_action; } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 605418538347..d30ecbfc8f84 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -31,7 +31,7 @@ static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, int action; tcf_lastuse_update(&d->tcf_tm); - bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); params = rcu_dereference_bh(d->params); action = READ_ONCE(d->tcf_action); diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index ecb9ee666095..9b6b52c5e24e 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -31,7 +31,7 @@ static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, u64 flags; tcf_lastuse_update(&d->tcf_tm); - bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); action = READ_ONCE(d->tcf_action); if (unlikely(action == TC_ACT_SHOT)) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 0fb387c9d706..eb2c5c8fcd32 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -884,7 +884,7 @@ static void qdisc_offload_graft_root(struct net_device *dev, static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, u32 portid, u32 seq, u16 flags, int event) { - struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; + struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; struct gnet_stats_queue __percpu *cpu_qstats = NULL; struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -942,8 +942,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, cpu_qstats = q->cpu_qstats; } - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q), - &d, cpu_bstats, &q->bstats) < 0 || + if (gnet_stats_copy_basic(&d, 
cpu_bstats, &q->bstats, true) < 0 || gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) goto nla_put_failure; @@ -1264,26 +1263,17 @@ static struct Qdisc *qdisc_create(struct net_device *dev, rcu_assign_pointer(sch->stab, stab); } if (tca[TCA_RATE]) { - seqcount_t *running; - err = -EOPNOTSUPP; if (sch->flags & TCQ_F_MQROOT) { NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc"); goto err_out4; } - if (sch->parent != TC_H_ROOT && - !(sch->flags & TCQ_F_INGRESS) && - (!p || !(p->flags & TCQ_F_MQROOT))) - running = qdisc_root_sleeping_running(sch); - else - running = &sch->running; - err = gen_new_estimator(&sch->bstats, sch->cpu_bstats, &sch->rate_est, NULL, - running, + true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to generate new estimator"); @@ -1359,7 +1349,7 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, sch->cpu_bstats, &sch->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); } out: diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 7d8518176b45..4c8e994cf0a5 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -52,7 +52,7 @@ struct atm_flow_data { struct atm_qdisc_data *parent; /* parent qdisc */ struct socket *sock; /* for closing */ int ref; /* reference count */ - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct list_head list; struct atm_flow_data *excess; /* flow for excess traffic; @@ -548,6 +548,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt, pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); INIT_LIST_HEAD(&p->flows); INIT_LIST_HEAD(&p->link.list); + gnet_stats_basic_sync_init(&p->link.bstats); list_add(&p->link.list, &p->flows); p->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle, extack); @@ -652,8 +653,7 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, { struct atm_flow_data *flow = (struct atm_flow_data *)arg; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &flow->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &flow->bstats, true) < 0 || gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) return -1; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index e0da15530f0e..02d9f0dfe356 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -116,7 +116,7 @@ struct cbq_class { long avgidle; long deficit; /* Saved deficit for WRR */ psched_time_t penalized; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct tc_cbq_xstats xstats; @@ -565,8 +565,7 @@ cbq_update(struct cbq_sched_data *q) long avgidle = cl->avgidle; long idle; - cl->bstats.packets++; - cl->bstats.bytes += len; + _bstats_update(&cl->bstats, len, 1); /* * (now - last) is total time between packet right edges. 
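Note what replaces qdisc_root_sleeping_running() in the qdisc_create()/qdisc_change() hunks above: gen_new_estimator() and gen_replace_estimator() now take a plain bool. With the counters carrying their own syncp, a caller only has to state whether the bstats may be written while the estimator samples them. A hedged caller-side sketch (demo_* names are illustrative, not from the patch):

#include <net/gen_stats.h>

struct demo_class {
	struct gnet_stats_basic_sync bstats;
	struct net_rate_estimator __rcu *rate_est;
};

static int demo_attach_estimator(struct demo_class *cl, struct nlattr *opt)
{
	/* NULL: no extra spinlock needed; true: counters may be
	 * updated concurrently by the running qdisc.
	 */
	return gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
				 NULL, true, opt);
}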
@@ -1384,8 +1383,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (cl->undertime != PSCHED_PASTPERFECT) cl->xstats.undertime = cl->undertime - q->now; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; @@ -1519,7 +1517,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator"); @@ -1611,6 +1609,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (cl == NULL) goto failure; + gnet_stats_basic_sync_init(&cl->bstats); err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); @@ -1619,9 +1618,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), - tca[TCA_RATE]); + NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Couldn't create new estimator"); tcf_block_put(cl->block); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 642cd179b7a7..18e4f7a0b291 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -19,7 +19,7 @@ struct drr_class { struct Qdisc_class_common common; unsigned int filter_cnt; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct list_head alist; @@ -85,8 +85,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), + NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); @@ -106,6 +105,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; + gnet_stats_basic_sync_init(&cl->bstats); cl->common.classid = classid; cl->quantum = quantum; cl->qdisc = qdisc_create_dflt(sch->dev_queue, @@ -118,9 +118,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), - tca[TCA_RATE]); + NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); qdisc_put(cl->qdisc); @@ -267,8 +265,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (qlen) xstats.deficit = cl->deficit; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, cl_q->cpu_qstats, &cl_q->qstats, qlen) < 0) return -1; diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index 44fa2532a87c..d73393493553 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -41,7 +41,7 @@ struct ets_class { struct Qdisc *qdisc; u32 quantum; u32 deficit; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; }; @@ -325,8 +325,7 @@ static int 
ets_class_dump_stats(struct Qdisc *sch, unsigned long arg, struct ets_class *cl = ets_class_from_arg(sch, arg); struct Qdisc *cl_q = cl->qdisc; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; @@ -661,7 +660,6 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, q->nbands = nbands; for (i = nstrict; i < q->nstrict; i++) { - INIT_LIST_HEAD(&q->classes[i].alist); if (q->classes[i].qdisc->q.qlen) { list_add_tail(&q->classes[i].alist, &q->active); q->classes[i].deficit = quanta[i]; @@ -689,7 +687,11 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, ets_offload_change(sch); for (i = q->nbands; i < oldbands; i++) { qdisc_put(q->classes[i].qdisc); - memset(&q->classes[i], 0, sizeof(q->classes[i])); + q->classes[i].qdisc = NULL; + q->classes[i].quantum = 0; + q->classes[i].deficit = 0; + gnet_stats_basic_sync_init(&q->classes[i].bstats); + memset(&q->classes[i].qstats, 0, sizeof(q->classes[i].qstats)); } return 0; } @@ -698,7 +700,7 @@ static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct ets_sched *q = qdisc_priv(sch); - int err; + int err, i; if (!opt) return -EINVAL; @@ -708,6 +710,9 @@ static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, return err; INIT_LIST_HEAD(&q->active); + for (i = 0; i < TCQ_ETS_MAX_BANDS; i++) + INIT_LIST_HEAD(&q->classes[i].alist); + return ets_qdisc_change(sch, opt, extack); } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 30c29a9a2efd..dd27a062e913 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -304,8 +304,8 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, /* * Transmit possibly several skbs, and handle the return status as - * required. Owning running seqcount bit guarantees that - * only one CPU can execute this function. + * required. Owning qdisc running bit guarantees that only one CPU + * can execute this function. 
* * Returns to the caller: * false - hardware queue frozen backoff @@ -606,7 +606,6 @@ struct Qdisc noop_qdisc = { .ops = &noop_qdisc_ops, .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, - .running = SEQCNT_ZERO(noop_qdisc.running), .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), .gso_skb = { .next = (struct sk_buff *)&noop_qdisc.gso_skb, @@ -867,7 +866,6 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; -static struct lock_class_key qdisc_running_key; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, @@ -892,11 +890,12 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, __skb_queue_head_init(&sch->gso_skb); __skb_queue_head_init(&sch->skb_bad_txq); qdisc_skb_head_init(&sch->q); + gnet_stats_basic_sync_init(&sch->bstats); spin_lock_init(&sch->q.lock); if (ops->static_flags & TCQ_F_CPUSTATS) { sch->cpu_bstats = - netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!sch->cpu_bstats) goto errout1; @@ -916,10 +915,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, lockdep_set_class(&sch->seqlock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); - seqcount_init(&sch->running); - lockdep_set_class(&sch->running, - dev->qdisc_running_key ?: &qdisc_running_key); - sch->ops = ops; sch->flags = ops->static_flags; sch->enqueue = ops->enqueue; diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 621dc6afde8f..1073c76d05c4 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -56,6 +56,7 @@ struct gred_sched { u32 DPs; u32 def; struct red_vars wred_set; + struct tc_gred_qopt_offload *opt; }; static inline int gred_wred_mode(struct gred_sched *table) @@ -311,48 +312,50 @@ static void gred_offload(struct Qdisc *sch, enum tc_gred_command command) { struct gred_sched *table = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - struct tc_gred_qopt_offload opt = { - .command = command, - .handle = sch->handle, - .parent = sch->parent, - }; + struct tc_gred_qopt_offload *opt = table->opt; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; + memset(opt, 0, sizeof(*opt)); + opt->command = command; + opt->handle = sch->handle; + opt->parent = sch->parent; + if (command == TC_GRED_REPLACE) { unsigned int i; - opt.set.grio_on = gred_rio_mode(table); - opt.set.wred_on = gred_wred_mode(table); - opt.set.dp_cnt = table->DPs; - opt.set.dp_def = table->def; + opt->set.grio_on = gred_rio_mode(table); + opt->set.wred_on = gred_wred_mode(table); + opt->set.dp_cnt = table->DPs; + opt->set.dp_def = table->def; for (i = 0; i < table->DPs; i++) { struct gred_sched_data *q = table->tab[i]; if (!q) continue; - opt.set.tab[i].present = true; - opt.set.tab[i].limit = q->limit; - opt.set.tab[i].prio = q->prio; - opt.set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; - opt.set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; - opt.set.tab[i].is_ecn = gred_use_ecn(q); - opt.set.tab[i].is_harddrop = gred_use_harddrop(q); - opt.set.tab[i].probability = q->parms.max_P; - opt.set.tab[i].backlog = &q->backlog; + opt->set.tab[i].present = true; + opt->set.tab[i].limit = q->limit; + opt->set.tab[i].prio = q->prio; + opt->set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; + opt->set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; + opt->set.tab[i].is_ecn = gred_use_ecn(q); + opt->set.tab[i].is_harddrop = gred_use_harddrop(q); + opt->set.tab[i].probability 
= q->parms.max_P; + opt->set.tab[i].backlog = &q->backlog; } - opt.set.qstats = &sch->qstats; + opt->set.qstats = &sch->qstats; } - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, &opt); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, opt); } static int gred_offload_dump_stats(struct Qdisc *sch) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_qopt_offload *hw_stats; + u64 bytes = 0, packets = 0; unsigned int i; int ret; @@ -364,9 +367,11 @@ static int gred_offload_dump_stats(struct Qdisc *sch) hw_stats->handle = sch->handle; hw_stats->parent = sch->parent; - for (i = 0; i < MAX_DPs; i++) + for (i = 0; i < MAX_DPs; i++) { + gnet_stats_basic_sync_init(&hw_stats->stats.bstats[i]); if (table->tab[i]) hw_stats->stats.xstats[i] = &table->tab[i]->stats; + } ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats); /* Even if driver returns failure adjust the stats - in case offload @@ -375,19 +380,19 @@ static int gred_offload_dump_stats(struct Qdisc *sch) for (i = 0; i < MAX_DPs; i++) { if (!table->tab[i]) continue; - table->tab[i]->packetsin += hw_stats->stats.bstats[i].packets; - table->tab[i]->bytesin += hw_stats->stats.bstats[i].bytes; + table->tab[i]->packetsin += u64_stats_read(&hw_stats->stats.bstats[i].packets); + table->tab[i]->bytesin += u64_stats_read(&hw_stats->stats.bstats[i].bytes); table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog; - _bstats_update(&sch->bstats, - hw_stats->stats.bstats[i].bytes, - hw_stats->stats.bstats[i].packets); + bytes += u64_stats_read(&hw_stats->stats.bstats[i].bytes); + packets += u64_stats_read(&hw_stats->stats.bstats[i].packets); sch->qstats.qlen += hw_stats->stats.qstats[i].qlen; sch->qstats.backlog += hw_stats->stats.qstats[i].backlog; sch->qstats.drops += hw_stats->stats.qstats[i].drops; sch->qstats.requeues += hw_stats->stats.qstats[i].requeues; sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits; } + _bstats_update(&sch->bstats, bytes, packets); kfree(hw_stats); return ret; @@ -728,6 +733,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, static int gred_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + struct gred_sched *table = qdisc_priv(sch); struct nlattr *tb[TCA_GRED_MAX + 1]; int err; @@ -751,6 +757,12 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt, sch->limit = qdisc_dev(sch)->tx_queue_len * psched_mtu(qdisc_dev(sch)); + if (qdisc_dev(sch)->netdev_ops->ndo_setup_tc) { + table->opt = kzalloc(sizeof(*table->opt), GFP_KERNEL); + if (!table->opt) + return -ENOMEM; + } + return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack); } @@ -907,6 +919,7 @@ static void gred_destroy(struct Qdisc *sch) gred_destroy_vq(table->tab[i]); } gred_offload(sch, TC_GRED_DESTROY); + kfree(table->opt); } static struct Qdisc_ops gred_qdisc_ops __read_mostly = { diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index b7ac30cca035..d3979a6000e7 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -111,7 +111,7 @@ enum hfsc_class_flags { struct hfsc_class { struct Qdisc_class_common cl_common; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct tcf_proto __rcu *filter_list; /* filter list */ @@ -965,7 +965,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) return err; 
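A side note on the gred hunks above: struct tc_gred_qopt_offload embeds MAX_DPs worth of per-virtual-queue state and, as I read the motivation, has grown too large to sit on the stack once the stats types changed, so it is now allocated once in gred_init() and freed in gred_destroy(). The shape of the change, sketched with illustrative names:

#include <linux/slab.h>
#include <net/pkt_cls.h>

struct demo_gred_priv {
	struct tc_gred_qopt_offload *opt;	/* preallocated scratch */
};

static int demo_gred_init(struct demo_gred_priv *priv)
{
	priv->opt = kzalloc(sizeof(*priv->opt), GFP_KERNEL);
	return priv->opt ? 0 : -ENOMEM;
}

static void demo_gred_destroy(struct demo_gred_priv *priv)
{
	kfree(priv->opt);	/* safe on the !ndo_setup_tc (NULL) case too */
}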
@@ -1033,9 +1033,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), - tca[TCA_RATE]); + NULL, true, tca[TCA_RATE]); if (err) { tcf_block_put(cl->block); kfree(cl); @@ -1328,7 +1326,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.work = cl->cl_total; xstats.rtwork = cl->cl_cumul; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; @@ -1406,6 +1404,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt, if (err) return err; + gnet_stats_basic_sync_init(&q->root.bstats); q->root.cl_common.classid = sch->handle; q->root.sched = q; q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 5cbc32fee867..8fd419337d3f 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -113,8 +113,8 @@ struct htb_class { /* * Written often fields */ - struct gnet_stats_basic_packed bstats; - struct gnet_stats_basic_packed bstats_bias; + struct gnet_stats_basic_sync bstats; + struct gnet_stats_basic_sync bstats_bias; struct tc_htb_xstats xstats; /* our special stats */ /* token bucket parameters */ @@ -1308,10 +1308,11 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, static void htb_offload_aggregate_stats(struct htb_sched *q, struct htb_class *cl) { + u64 bytes = 0, packets = 0; struct htb_class *c; unsigned int i; - memset(&cl->bstats, 0, sizeof(cl->bstats)); + gnet_stats_basic_sync_init(&cl->bstats); for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) { @@ -1323,14 +1324,15 @@ static void htb_offload_aggregate_stats(struct htb_sched *q, if (p != cl) continue; - cl->bstats.bytes += c->bstats_bias.bytes; - cl->bstats.packets += c->bstats_bias.packets; + bytes += u64_stats_read(&c->bstats_bias.bytes); + packets += u64_stats_read(&c->bstats_bias.packets); if (c->level == 0) { - cl->bstats.bytes += c->leaf.q->bstats.bytes; - cl->bstats.packets += c->leaf.q->bstats.packets; + bytes += u64_stats_read(&c->leaf.q->bstats.bytes); + packets += u64_stats_read(&c->leaf.q->bstats.packets); } } } + _bstats_update(&cl->bstats, bytes, packets); } static int @@ -1357,16 +1359,16 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) if (cl->leaf.q) cl->bstats = cl->leaf.q->bstats; else - memset(&cl->bstats, 0, sizeof(cl->bstats)); - cl->bstats.bytes += cl->bstats_bias.bytes; - cl->bstats.packets += cl->bstats_bias.packets; + gnet_stats_basic_sync_init(&cl->bstats); + _bstats_update(&cl->bstats, + u64_stats_read(&cl->bstats_bias.bytes), + u64_stats_read(&cl->bstats_bias.packets)); } else { htb_offload_aggregate_stats(q, cl); } } - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0) return -1; @@ -1578,8 +1580,9 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, WARN_ON(old != q); if (cl->parent) { - cl->parent->bstats_bias.bytes += q->bstats.bytes; - cl->parent->bstats_bias.packets += q->bstats.packets; + _bstats_update(&cl->parent->bstats_bias, + 
u64_stats_read(&q->bstats.bytes), + u64_stats_read(&q->bstats.packets)); } offload_opt = (struct tc_htb_qopt_offload) { @@ -1869,6 +1872,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!cl) goto failure; + gnet_stats_basic_sync_init(&cl->bstats); + gnet_stats_basic_sync_init(&cl->bstats_bias); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); @@ -1878,7 +1884,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE] ? : &est.nla); if (err) goto err_block_put; @@ -1942,8 +1948,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, htb_graft_helper(dev_queue, old_q); goto err_kill_estimator; } - parent->bstats_bias.bytes += old_q->bstats.bytes; - parent->bstats_bias.packets += old_q->bstats.packets; + _bstats_update(&parent->bstats_bias, + u64_stats_read(&old_q->bstats.bytes), + u64_stats_read(&old_q->bstats.packets)); qdisc_put(old_q); } new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, @@ -2003,7 +2010,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) return err; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index db18d8a860f9..24c5d97d88dd 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -153,10 +153,9 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int ntx; - __u32 qlen = 0; sch->q.qlen = 0; - memset(&sch->bstats, 0, sizeof(sch->bstats)); + gnet_stats_basic_sync_init(&sch->bstats); memset(&sch->qstats, 0, sizeof(sch->qstats)); /* MQ supports lockless qdiscs. 
However, statistics accounting needs @@ -168,25 +167,11 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - if (qdisc_is_percpu_stats(qdisc)) { - qlen = qdisc_qlen_sum(qdisc); - __gnet_stats_copy_basic(NULL, &sch->bstats, - qdisc->cpu_bstats, - &qdisc->bstats); - __gnet_stats_copy_queue(&sch->qstats, - qdisc->cpu_qstats, - &qdisc->qstats, qlen); - sch->q.qlen += qlen; - } else { - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.qlen += qdisc->qstats.qlen; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues += qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; - } + gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, + &qdisc->bstats, false); + gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, + &qdisc->qstats); + sch->q.qlen += qdisc_qlen(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } @@ -269,8 +254,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats, - &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, sch->cpu_bstats, &sch->bstats, true) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 50e15add6068..42d4101e4f3d 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -412,7 +412,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) unsigned int ntx, tc; sch->q.qlen = 0; - memset(&sch->bstats, 0, sizeof(sch->bstats)); + gnet_stats_basic_sync_init(&sch->bstats); memset(&sch->qstats, 0, sizeof(sch->qstats)); /* MQ supports lockless qdiscs. 
However, statistics accounting needs @@ -424,25 +424,11 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - if (qdisc_is_percpu_stats(qdisc)) { - __u32 qlen = qdisc_qlen_sum(qdisc); - - __gnet_stats_copy_basic(NULL, &sch->bstats, - qdisc->cpu_bstats, - &qdisc->bstats); - __gnet_stats_copy_queue(&sch->qstats, - qdisc->cpu_qstats, - &qdisc->qstats, qlen); - sch->q.qlen += qlen; - } else { - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues += qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; - } + gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, + &qdisc->bstats, false); + gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, + &qdisc->qstats); + sch->q.qlen += qdisc_qlen(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } @@ -534,12 +520,13 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, { if (cl >= TC_H_MIN_PRIORITY) { int i; - __u32 qlen = 0; + __u32 qlen; struct gnet_stats_queue qstats = {0}; - struct gnet_stats_basic_packed bstats = {0}; + struct gnet_stats_basic_sync bstats; struct net_device *dev = qdisc_dev(sch); struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK]; + gnet_stats_basic_sync_init(&bstats); /* Drop lock here it will be reclaimed before touching * statistics this is required because the d->lock we * hold here is the look on dev_queue->qdisc_sleeping @@ -554,40 +541,28 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, spin_lock_bh(qdisc_lock(qdisc)); - if (qdisc_is_percpu_stats(qdisc)) { - qlen = qdisc_qlen_sum(qdisc); - - __gnet_stats_copy_basic(NULL, &bstats, - qdisc->cpu_bstats, - &qdisc->bstats); - __gnet_stats_copy_queue(&qstats, - qdisc->cpu_qstats, - &qdisc->qstats, - qlen); - } else { - qlen += qdisc->q.qlen; - bstats.bytes += qdisc->bstats.bytes; - bstats.packets += qdisc->bstats.packets; - qstats.backlog += qdisc->qstats.backlog; - qstats.drops += qdisc->qstats.drops; - qstats.requeues += qdisc->qstats.requeues; - qstats.overlimits += qdisc->qstats.overlimits; - } + gnet_stats_add_basic(&bstats, qdisc->cpu_bstats, + &qdisc->bstats, false); + gnet_stats_add_queue(&qstats, qdisc->cpu_qstats, + &qdisc->qstats); + sch->q.qlen += qdisc_qlen(qdisc); + spin_unlock_bh(qdisc_lock(qdisc)); } + qlen = qdisc_qlen(sch) + qstats.qlen; /* Reclaim root sleeping lock before completing stats */ if (d->lock) spin_lock_bh(d->lock); - if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &bstats, false) < 0 || gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0) return -1; } else { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, - sch->cpu_bstats, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, sch->cpu_bstats, + &sch->bstats, true) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; } diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index e282e7382117..cd8ab90c4765 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -338,8 +338,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, cl_q->cpu_bstats, 
&cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 03fdf31ccb6a..3b8d7197c06b 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -361,8 +361,8 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, + &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index aea435b0aeb3..d4ce58c90f9f 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -131,7 +131,7 @@ struct qfq_class { unsigned int filter_cnt; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct Qdisc *qdisc; @@ -451,7 +451,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) return err; @@ -465,6 +465,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; + gnet_stats_basic_sync_init(&cl->bstats); cl->common.classid = classid; cl->deficit = lmax; @@ -477,7 +478,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) goto destroy_class; @@ -639,8 +640,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || qdisc_qstats_copy(d, cl->qdisc) < 0) return -1; @@ -1234,8 +1234,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - cl->bstats.bytes += len; - cl->bstats.packets += gso_segs; + _bstats_update(&cl->bstats, len, gso_segs); sch->qstats.backlog += len; ++sch->q.qlen; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 474ba4db5de2..b9c71a304d39 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1985,7 +1985,7 @@ static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &sch->bstats, true) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; return 0; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 935bba065636..962e79730eb5 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -441,7 +441,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) return; - cpu = get_cpu(); + cpu = get_cpu_light(); pool = svc_pool_for_cpu(xprt->xpt_server, cpu); atomic_long_inc(&pool->sp_stats.packets); @@ -465,7 +465,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) rqstp = NULL; out_unlock: rcu_read_unlock(); - put_cpu(); + put_cpu_light(); trace_svc_xprt_do_enqueue(xprt, rqstp); } 
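The sunrpc hunk above swaps get_cpu()/put_cpu() for get_cpu_light()/put_cpu_light(). These helpers are not a mainline API; in the PREEMPT_RT tree they amount to roughly the following (an assumption from the RT patch set, shown with demo_ prefixes), trading preemption-off for migration-off so the enqueue path stays preemptible:

#include <linux/preempt.h>
#include <linux/smp.h>

/* Pin the task to this CPU without disabling preemption. */
#define demo_get_cpu_light()			\
({						\
	migrate_disable();			\
	smp_processor_id();			\
})

#define demo_put_cpu_light()	migrate_enable()

svc_pool_for_cpu() only needs a stable CPU number, not atomicity, so migration protection is sufficient here.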
EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue); diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c index 5a90aa527877..642d0748c169 100644 --- a/samples/kfifo/bytestream-example.c +++ b/samples/kfifo/bytestream-example.c @@ -22,10 +22,10 @@ #define PROC_FIFO "bytestream-fifo" /* lock for procfs read access */ -static DEFINE_MUTEX(read_lock); +static DEFINE_MUTEX(read_access); /* lock for procfs write access */ -static DEFINE_MUTEX(write_lock); +static DEFINE_MUTEX(write_access); /* * define DYNAMIC in this example for a dynamically allocated fifo. @@ -116,12 +116,12 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&write_lock)) + if (mutex_lock_interruptible(&write_access)) return -ERESTARTSYS; ret = kfifo_from_user(&test, buf, count, &copied); - mutex_unlock(&write_lock); + mutex_unlock(&write_access); if (ret) return ret; @@ -134,12 +134,12 @@ static ssize_t fifo_read(struct file *file, char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&read_lock)) + if (mutex_lock_interruptible(&read_access)) return -ERESTARTSYS; ret = kfifo_to_user(&test, buf, count, &copied); - mutex_unlock(&read_lock); + mutex_unlock(&read_access); if (ret) return ret; diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c index e5403d8c971a..c61482ba94f4 100644 --- a/samples/kfifo/inttype-example.c +++ b/samples/kfifo/inttype-example.c @@ -22,10 +22,10 @@ #define PROC_FIFO "int-fifo" /* lock for procfs read access */ -static DEFINE_MUTEX(read_lock); +static DEFINE_MUTEX(read_access); /* lock for procfs write access */ -static DEFINE_MUTEX(write_lock); +static DEFINE_MUTEX(write_access); /* * define DYNAMIC in this example for a dynamically allocated fifo. @@ -109,12 +109,12 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&write_lock)) + if (mutex_lock_interruptible(&write_access)) return -ERESTARTSYS; ret = kfifo_from_user(&test, buf, count, &copied); - mutex_unlock(&write_lock); + mutex_unlock(&write_access); if (ret) return ret; @@ -127,12 +127,12 @@ static ssize_t fifo_read(struct file *file, char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&read_lock)) + if (mutex_lock_interruptible(&read_access)) return -ERESTARTSYS; ret = kfifo_to_user(&test, buf, count, &copied); - mutex_unlock(&read_lock); + mutex_unlock(&read_access); if (ret) return ret; diff --git a/samples/kfifo/record-example.c b/samples/kfifo/record-example.c index f64f3d62d6c2..e4087b2d3fc4 100644 --- a/samples/kfifo/record-example.c +++ b/samples/kfifo/record-example.c @@ -22,10 +22,10 @@ #define PROC_FIFO "record-fifo" /* lock for procfs read access */ -static DEFINE_MUTEX(read_lock); +static DEFINE_MUTEX(read_access); /* lock for procfs write access */ -static DEFINE_MUTEX(write_lock); +static DEFINE_MUTEX(write_access); /* * define DYNAMIC in this example for a dynamically allocated fifo. 
@@ -123,12 +123,12 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&write_lock)) + if (mutex_lock_interruptible(&write_access)) return -ERESTARTSYS; ret = kfifo_from_user(&test, buf, count, &copied); - mutex_unlock(&write_lock); + mutex_unlock(&write_access); if (ret) return ret; @@ -141,12 +141,12 @@ static ssize_t fifo_read(struct file *file, char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&read_lock)) + if (mutex_lock_interruptible(&read_access)) return -ERESTARTSYS; ret = kfifo_to_user(&test, buf, count, &copied); - mutex_unlock(&read_lock); + mutex_unlock(&read_access); if (ret) return ret; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 3f3f56f6be4d..5dbcdc5b22b5 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -51,8 +51,10 @@ #define SMK_RECEIVING 1 #define SMK_SENDING 2 +#ifdef SMACK_IPV6_PORT_LABELING static DEFINE_MUTEX(smack_ipv6_lock); static LIST_HEAD(smk_ipv6_port_list); +#endif struct kmem_cache *smack_rule_cache; int smack_enabled __initdata; @@ -2603,7 +2605,6 @@ static void smk_ipv6_port_label(struct socket *sock, struct sockaddr *address) mutex_unlock(&smack_ipv6_lock); return; } -#endif /** * smk_ipv6_port_check - check Smack port access @@ -2666,6 +2667,7 @@ static int smk_ipv6_port_check(struct sock *sk, struct sockaddr_in6 *address, return smk_ipv6_check(skp, object, address, act); } +#endif /** * smack_inode_setsecurity - set smack xattrs @@ -2852,8 +2854,9 @@ static int smack_socket_connect(struct socket *sock, struct sockaddr *sap, rc = smk_ipv6_check(ssp->smk_out, rsp, sip, SMK_CONNECTING); } - if (__is_defined(SMACK_IPV6_PORT_LABELING)) - rc = smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING); +#ifdef SMACK_IPV6_PORT_LABELING + rc = smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING); +#endif return rc; } diff --git a/sound/soc/mediatek/common/mtk-afe-fe-dai.c b/sound/soc/mediatek/common/mtk-afe-fe-dai.c index e95c7c018e7d..4f2c2379531b 100644 --- a/sound/soc/mediatek/common/mtk-afe-fe-dai.c +++ b/sound/soc/mediatek/common/mtk-afe-fe-dai.c @@ -288,7 +288,6 @@ const struct snd_soc_dai_ops mtk_afe_fe_ops = { }; EXPORT_SYMBOL_GPL(mtk_afe_fe_ops); -static DEFINE_MUTEX(irqs_lock); int mtk_dynamic_irq_acquire(struct mtk_base_afe *afe) { int i;
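Finally, a word on the kfifo sample renames above: read_lock and write_lock are also the names of the kernel's rwlock acquire primitives, so mutexes called that read like locking-API misuse and collide in grep output; read_access/write_access carries no functional change. A hypothetical file (illustrative only, not from the samples) showing why the old names were confusing:

#include <linux/mutex.h>
#include <linux/spinlock.h>

/* Both lines below are legal C, which is exactly the problem:
 * the identifier shadows a well-known locking macro name.
 */
static DEFINE_MUTEX(read_lock);		/* a mutex named like a primitive */
static DEFINE_RWLOCK(demo_rwlock);

static void demo_readers(void)
{
	mutex_lock(&read_lock);		/* the sample's procfs mutex */
	read_lock(&demo_rwlock);	/* the actual rwlock primitive */
	read_unlock(&demo_rwlock);
	mutex_unlock(&read_lock);
}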