diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index e05e581af5cfe617f38112907aadea8d03f2a9d6..3a6e1fcad3c027ad954f1e40b5ff76f3b163fb81 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -60,8 +60,8 @@ privileged data touched during the speculative execution. Spectre variant 1 attacks take advantage of speculative execution of conditional branches, while Spectre variant 2 attacks use speculative execution of indirect branches to leak privileged memory. -See :ref:`[1] ` :ref:`[5] ` :ref:`[7] ` -:ref:`[10] ` :ref:`[11] `. +See :ref:`[1] ` :ref:`[5] ` :ref:`[6] ` +:ref:`[7] ` :ref:`[10] ` :ref:`[11] `. Spectre variant 1 (Bounds Check Bypass) --------------------------------------- @@ -131,6 +131,19 @@ steer its indirect branch speculations to gadget code, and measure the speculative execution's side effects left in level 1 cache to infer the victim's data. +Yet another variant 2 attack vector is for the attacker to poison the +Branch History Buffer (BHB) to speculatively steer an indirect branch +to a specific Branch Target Buffer (BTB) entry, even if the entry isn't +associated with the source address of the indirect branch. Specifically, +the BHB might be shared across privilege levels even in the presence of +Enhanced IBRS. + +Currently the only known real-world BHB attack vector is via +unprivileged eBPF. Therefore, it's highly recommended to not enable +unprivileged eBPF, especially when eIBRS is used (without retpolines). +For a full mitigation against BHB attacks, it's recommended to use +retpolines (or eIBRS combined with retpolines). + Attack scenarios ---------------- @@ -364,13 +377,15 @@ The possible values in this file are: - Kernel status: - ==================================== ================================= - 'Not affected' The processor is not vulnerable - 'Vulnerable' Vulnerable, no mitigation - 'Mitigation: Full generic retpoline' Software-focused mitigation - 'Mitigation: Full AMD retpoline' AMD-specific software mitigation - 'Mitigation: Enhanced IBRS' Hardware-focused mitigation - ==================================== ================================= + ======================================== ================================= + 'Not affected' The processor is not vulnerable + 'Mitigation: None' Vulnerable, no mitigation + 'Mitigation: Retpolines' Use Retpoline thunks + 'Mitigation: LFENCE' Use LFENCE instructions + 'Mitigation: Enhanced IBRS' Hardware-focused mitigation + 'Mitigation: Enhanced IBRS + Retpolines' Hardware-focused + Retpolines + 'Mitigation: Enhanced IBRS + LFENCE' Hardware-focused + LFENCE + ======================================== ================================= - Firmware status: Show if Indirect Branch Restricted Speculation (IBRS) is used to protect against Spectre variant 2 attacks when calling firmware (x86 only). @@ -584,12 +599,13 @@ kernel command line. Specific mitigations can also be selected manually: - retpoline - replace indirect branches - retpoline,generic - google's original retpoline - retpoline,amd - AMD-specific minimal thunk + retpoline auto pick between generic,lfence + retpoline,generic Retpolines + retpoline,lfence LFENCE; indirect branch + retpoline,amd alias for retpoline,lfence + eibrs enhanced IBRS + eibrs,retpoline enhanced IBRS + Retpolines + eibrs,lfence enhanced IBRS + LFENCE Not specifying this option is equivalent to spectre_v2=auto. @@ -730,7 +746,7 @@ AMD white papers: .. 
_spec_ref6: -[6] `Software techniques for managing speculation on AMD processors `_. +[6] `Software techniques for managing speculation on AMD processors `_. ARM white papers: diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a46b2fe191ba70710832ceeeb961cbedbeff407e..212832a5ca68ad80308a53af417fd5e69130d794 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -516,6 +516,10 @@ cio_ignore= [S390] See Documentation/s390/CommonIO for details. + + clear_freelist + Enable clear_freelist feature. + clk_ignore_unused [CLK] Prevents the clock framework from automatically gating @@ -1401,6 +1405,13 @@ hpet_mmap= [X86, HPET_MMAP] Allow userspace to mmap HPET registers. Default set by CONFIG_HPET_MMAP_DEFAULT. + hugepage_prohibit_sz= + [HW] HugeTLB pages should not alloc when the rest of + the normal pages less than hugepage_prohibit_sz. This + setting is to make sure a system can start even when + part of physical memory is broken, admin users can + adjust this according to typical environment. + hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot. If using node format, the number of pages to allocate per-node can be specified. @@ -4402,8 +4413,12 @@ Specific mitigations can also be selected manually: retpoline - replace indirect branches - retpoline,generic - google's original retpoline - retpoline,amd - AMD-specific minimal thunk + retpoline,generic - Retpolines + retpoline,lfence - LFENCE; indirect branch + retpoline,amd - alias for retpoline,lfence + eibrs - enhanced IBRS + eibrs,retpoline - enhanced IBRS + Retpolines + eibrs,lfence - enhanced IBRS + LFENCE Not specifying this option is equivalent to spectre_v2=auto. diff --git a/Documentation/devicetree/bindings/arm/arm,mpam.txt b/Documentation/devicetree/bindings/arm/arm,mpam.txt index 65c1e6809685e42444d654f9095bb4233b9a648d..e9ba09bb3159136f7290ea57927f276482afb048 100644 --- a/Documentation/devicetree/bindings/arm/arm,mpam.txt +++ b/Documentation/devicetree/bindings/arm/arm,mpam.txt @@ -28,27 +28,30 @@ and Monitoring (MPAM), for Armv8-A", MPAM interrupts(section 8.8). 
Example: -mpam_memory0 { +mpam { compatible = "arm,mpam"; - reg = <0x0 0x10000000 0x0 0x10000>; - type = <2>; /* memory type */ - numa-node-id = <0>; - overflow-interrupt = <0>; - overflow-flags = <0>; - error-interrupt = <0>; - error-interrupt-flags = <0>; - not-ready-max = <0>; -}; -mpam_cache0 { - compatible = "arm,mpam"; - reg = <0x0 0x20000000 0x0 0x10000>; - type = <1>; /* cache type */ - cache-id = <0>; - cache-level = <3>; - overflow-interrupt = <0>; - overflow-flags = <0>; - error-interrupt = <0>; - error-interrupt-flags = <0>; - not-ready-max = <0>; + mpam_memory0 { + reg = <0x0 0x10000000 0x0 0x10000>; + type = <2>; /* memory type */ + numa-node-id = <0>; + overflow-interrupt = <0>; + overflow-flags = <0>; + error-interrupt = <0>; + error-interrupt-flags = <0>; + not-ready-max = <0>; + }; + + mpam_cache0 { + reg = <0x0 0x20000000 0x0 0x10000>; + type = <1>; /* cache type */ + cache-id = <0>; + cache-level = <3>; + overflow-interrupt = <0>; + overflow-flags = <0>; + error-interrupt = <0>; + error-interrupt-flags = <0>; + not-ready-max = <0>; + }; + }; diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt index e3abfbd32f71ebd03b7da401d232e0334a897260..b020e6ce6dd49b6c50513953653ec4a5cf8ed7c4 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.txt @@ -191,11 +191,12 @@ ad_actor_sys_prio ad_actor_system In an AD system, this specifies the mac-address for the actor in - protocol packet exchanges (LACPDUs). The value cannot be NULL or - multicast. It is preferred to have the local-admin bit set for this - mac but driver does not enforce it. If the value is not given then - system defaults to using the masters' mac address as actors' system - address. + protocol packet exchanges (LACPDUs). The value cannot be a multicast + address. If the all-zeroes MAC is specified, bonding will internally + use the MAC of the bond itself. It is preferred to have the + local-admin bit set for this mac but driver does not enforce it. If + the value is not given then system defaults to using the masters' + mac address as actors' system address. This parameter has effect only in 802.3ad mode and is available through SysFs interface. diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 7e33d039ea171ae1a2fe1edfc54b3f92f6020dfd..1fa03508ff5ac6b85106628e20255a5a51ed4fde 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -95,6 +95,7 @@ show up in /proc/sys/kernel: - sysctl_writes_strict - tainted - threads-max +- unprivileged_bpf_disabled - unknown_nmi_panic - watchdog - watchdog_thresh @@ -1072,6 +1073,26 @@ available RAM pages threads-max is reduced accordingly. ============================================================== +unprivileged_bpf_disabled: + +Writing 1 to this entry will disable unprivileged calls to bpf(); +once disabled, calling bpf() without CAP_SYS_ADMIN will return +-EPERM. Once set to 1, this can't be cleared from the running kernel +anymore. + +Writing 2 to this entry will also disable unprivileged calls to bpf(), +however, an admin can still change this setting later on, if needed, by +writing 0 or 1 to this entry. + +If BPF_UNPRIV_DEFAULT_OFF is enabled in the kernel config, then this +entry will default to 2 instead of 0. 
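As a rough illustration of the behaviour described above (a hypothetical stand-alone test, not part of this patch): an unprivileged bpf() call is expected to fail with EPERM once this sysctl is set to 1 or 2, and to succeed for a plain array map while it is 0.

	/* bpf_unpriv_check.c - hypothetical example, not from the patch.
	 * Run as a non-root user; with unprivileged_bpf_disabled set to
	 * 1 or 2 the map creation below should fail with EPERM.
	 */
	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	int main(void)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.map_type = BPF_MAP_TYPE_ARRAY;
		attr.key_size = sizeof(int);
		attr.value_size = sizeof(long);
		attr.max_entries = 1;

		/* Direct syscall so no libbpf is needed. */
		if (syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)) < 0)
			printf("bpf(BPF_MAP_CREATE): %s\n", strerror(errno));
		else
			printf("bpf(BPF_MAP_CREATE): allowed\n");

		return 0;
	}

Note that after writing 1, only a reboot restores unprivileged access, whereas a value of 2 can still be relaxed by a later write of 0.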
+ + 0 - Unprivileged calls to bpf() are enabled + 1 - Unprivileged calls to bpf() are disabled without recovery + 2 - Unprivileged calls to bpf() are disabled + +============================================================== + unknown_nmi_panic: The value in this file affects behavior of handling NMI. When the diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 1c7ce2716ad379843a5974ac3494917f98491922..977df8306905f7a4506c36e103573dd441b1ce25 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -35,6 +35,7 @@ #define MAP_FIXED_NOREPLACE 0x200000/* MAP_FIXED which doesn't unmap underlying mapping */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ #define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */ +#define MAP_ALIGN 0x2000000 /* create an aligned mapping */ #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_SYNC 2 /* synchronous memory sync */ diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 88286dd483ff901bd1d215fb5450db32add47bd2..f903e1040b36b769b4bb7c476185c6ed8c43619f 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -110,6 +110,16 @@ .endm #endif +#if __LINUX_ARM_ARCH__ < 7 + .macro dsb, args + mcr p15, 0, r0, c7, c10, 4 + .endm + + .macro isb, args + mcr p15, 0, r0, c7, c5, 4 + .endm +#endif + .macro asm_trace_hardirqs_off, save=1 #if defined(CONFIG_TRACE_IRQFLAGS) .if \save diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 0066de61f4c622de757f7667378f491a76a001be..46a2e8636f868cabb677c07b53050454d0c1a083 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -405,4 +406,10 @@ static inline int kvm_arm_config_vm(struct kvm *kvm, unsigned long type) return 0; } +static inline int kvm_arm_get_spectre_bhb_state(void) +{ + /* 32bit guests don't need firmware for this */ + return SPECTRE_VULNERABLE; /* aka SMCCC_RET_NOT_SUPPORTED */ +} + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/spectre.h b/arch/arm/include/asm/spectre.h new file mode 100644 index 0000000000000000000000000000000000000000..85f9e538fb325730613f78419f52826a68b8d76c --- /dev/null +++ b/arch/arm/include/asm/spectre.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_SPECTRE_H +#define __ASM_SPECTRE_H + +enum { + SPECTRE_UNAFFECTED, + SPECTRE_MITIGATED, + SPECTRE_VULNERABLE, +}; + +enum { + __SPECTRE_V2_METHOD_BPIALL, + __SPECTRE_V2_METHOD_ICIALLU, + __SPECTRE_V2_METHOD_SMC, + __SPECTRE_V2_METHOD_HVC, + __SPECTRE_V2_METHOD_LOOP8, +}; + +enum { + SPECTRE_V2_METHOD_BPIALL = BIT(__SPECTRE_V2_METHOD_BPIALL), + SPECTRE_V2_METHOD_ICIALLU = BIT(__SPECTRE_V2_METHOD_ICIALLU), + SPECTRE_V2_METHOD_SMC = BIT(__SPECTRE_V2_METHOD_SMC), + SPECTRE_V2_METHOD_HVC = BIT(__SPECTRE_V2_METHOD_HVC), + SPECTRE_V2_METHOD_LOOP8 = BIT(__SPECTRE_V2_METHOD_LOOP8), +}; + +#ifdef CONFIG_GENERIC_CPU_VULNERABILITIES +void spectre_v2_update_state(unsigned int state, unsigned int methods); +#else +static inline void spectre_v2_update_state(unsigned int state, + unsigned int methods) +{} +#endif + +int spectre_bhb_update_vectors(unsigned int method); + +#endif diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 8cad59465af39ac44eafecc881e0ed9a6afc9893..70fd896c132a67766a3d91827f28a7c4a7401689 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ 
-102,4 +102,6 @@ endif obj-$(CONFIG_HAVE_ARM_SMCCC) += smccc-call.o +obj-$(CONFIG_GENERIC_CPU_VULNERABILITIES) += spectre.o + extra-y := $(head-y) vmlinux.lds diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index e85a3af9ddeb5694b793363f8245ba1ad5f99899..c2d36d31e908d78d1d364e63617cfb06ad3a8188 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -1038,12 +1038,11 @@ vector_\name: sub lr, lr, #\correction .endif - @ - @ Save r0, lr_ (parent PC) and spsr_ - @ (parent CPSR) - @ + @ Save r0, lr_ (parent PC) stmia sp, {r0, lr} @ save r0, lr - mrs lr, spsr + + @ Save spsr_ (parent CPSR) +2: mrs lr, spsr str lr, [sp, #8] @ save spsr @ @@ -1064,6 +1063,44 @@ vector_\name: movs pc, lr @ branch to handler in SVC mode ENDPROC(vector_\name) +#ifdef CONFIG_HARDEN_BRANCH_HISTORY + .subsection 1 + .align 5 +vector_bhb_loop8_\name: + .if \correction + sub lr, lr, #\correction + .endif + + @ Save r0, lr_ (parent PC) + stmia sp, {r0, lr} + + @ bhb workaround + mov r0, #8 +3: b . + 4 + subs r0, r0, #1 + bne 3b + dsb + isb + b 2b +ENDPROC(vector_bhb_loop8_\name) + +vector_bhb_bpiall_\name: + .if \correction + sub lr, lr, #\correction + .endif + + @ Save r0, lr_ (parent PC) + stmia sp, {r0, lr} + + @ bhb workaround + mcr p15, 0, r0, c7, c5, 6 @ BPIALL + @ isb not needed due to "movs pc, lr" in the vector stub + @ which gives a "context synchronisation". + b 2b +ENDPROC(vector_bhb_bpiall_\name) + .previous +#endif + .align 2 @ handler addresses follow this label 1: @@ -1072,6 +1109,10 @@ ENDPROC(vector_\name) .section .stubs, "ax", %progbits @ This must be the first word .word vector_swi +#ifdef CONFIG_HARDEN_BRANCH_HISTORY + .word vector_bhb_loop8_swi + .word vector_bhb_bpiall_swi +#endif vector_rst: ARM( swi SYS_ERROR0 ) @@ -1186,8 +1227,10 @@ vector_addrexcptn: * FIQ "NMI" handler *----------------------------------------------------------------------------- * Handle a FIQ using the SVC stack allowing FIQ act like NMI on x86 - * systems. + * systems. This must be the last vector stub, so lets place it in its own + * subsection. 
*/ + .subsection 2 vector_stub fiq, FIQ_MODE, 4 .long __fiq_usr @ 0 (USR_26 / USR_32) @@ -1220,6 +1263,30 @@ vector_addrexcptn: W(b) vector_irq W(b) vector_fiq +#ifdef CONFIG_HARDEN_BRANCH_HISTORY + .section .vectors.bhb.loop8, "ax", %progbits +.L__vectors_bhb_loop8_start: + W(b) vector_rst + W(b) vector_bhb_loop8_und + W(ldr) pc, .L__vectors_bhb_loop8_start + 0x1004 + W(b) vector_bhb_loop8_pabt + W(b) vector_bhb_loop8_dabt + W(b) vector_addrexcptn + W(b) vector_bhb_loop8_irq + W(b) vector_bhb_loop8_fiq + + .section .vectors.bhb.bpiall, "ax", %progbits +.L__vectors_bhb_bpiall_start: + W(b) vector_rst + W(b) vector_bhb_bpiall_und + W(ldr) pc, .L__vectors_bhb_bpiall_start + 0x1008 + W(b) vector_bhb_bpiall_pabt + W(b) vector_bhb_bpiall_dabt + W(b) vector_addrexcptn + W(b) vector_bhb_bpiall_irq + W(b) vector_bhb_bpiall_fiq +#endif + .data .align 2 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 0465d65d23de5786ef5df32738d0830ca492353a..e27fc2df523167cd6dbbeeb5c65bd7d23dd21402 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -165,6 +165,29 @@ ENDPROC(ret_from_fork) *----------------------------------------------------------------------------- */ + .align 5 +#ifdef CONFIG_HARDEN_BRANCH_HISTORY +ENTRY(vector_bhb_loop8_swi) + sub sp, sp, #PT_REGS_SIZE + stmia sp, {r0 - r12} + mov r8, #8 +1: b 2f +2: subs r8, r8, #1 + bne 1b + dsb + isb + b 3f +ENDPROC(vector_bhb_loop8_swi) + + .align 5 +ENTRY(vector_bhb_bpiall_swi) + sub sp, sp, #PT_REGS_SIZE + stmia sp, {r0 - r12} + mcr p15, 0, r8, c7, c5, 6 @ BPIALL + isb + b 3f +ENDPROC(vector_bhb_bpiall_swi) +#endif .align 5 ENTRY(vector_swi) #ifdef CONFIG_CPU_V7M @@ -172,6 +195,7 @@ ENTRY(vector_swi) #else sub sp, sp, #PT_REGS_SIZE stmia sp, {r0 - r12} @ Calling r0 - r12 +3: ARM( add r8, sp, #S_PC ) ARM( stmdb r8, {sp, lr}^ ) @ Calling sp, lr THUMB( mov r8, sp ) diff --git a/arch/arm/kernel/spectre.c b/arch/arm/kernel/spectre.c new file mode 100644 index 0000000000000000000000000000000000000000..0dcefc36fb7a08af113ef923af1fe9c826da662d --- /dev/null +++ b/arch/arm/kernel/spectre.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include + +#include + +static bool _unprivileged_ebpf_enabled(void) +{ +#ifdef CONFIG_BPF_SYSCALL + return !sysctl_unprivileged_bpf_disabled; +#else + return false; +#endif +} + +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); +} + +static unsigned int spectre_v2_state; +static unsigned int spectre_v2_methods; + +void spectre_v2_update_state(unsigned int state, unsigned int method) +{ + if (state > spectre_v2_state) + spectre_v2_state = state; + spectre_v2_methods |= method; +} + +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, + char *buf) +{ + const char *method; + + if (spectre_v2_state == SPECTRE_UNAFFECTED) + return sprintf(buf, "%s\n", "Not affected"); + + if (spectre_v2_state != SPECTRE_MITIGATED) + return sprintf(buf, "%s\n", "Vulnerable"); + + if (_unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + + switch (spectre_v2_methods) { + case SPECTRE_V2_METHOD_BPIALL: + method = "Branch predictor hardening"; + break; + + case SPECTRE_V2_METHOD_ICIALLU: + method = "I-cache invalidation"; + break; + + case SPECTRE_V2_METHOD_SMC: + case SPECTRE_V2_METHOD_HVC: + method = "Firmware call"; + break; + + case SPECTRE_V2_METHOD_LOOP8: + method = 
"History overwrite"; + break; + + default: + method = "Multiple mitigations"; + break; + } + + return sprintf(buf, "Mitigation: %s\n", method); +} diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 2a21239aec91ccab63bf1689847e431e48dec8a6..4c61ac4f3d3555b8a3765ad48e2a2845be838ee7 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -828,10 +829,59 @@ static inline void __init kuser_init(void *vectors) } #endif +#ifndef CONFIG_CPU_V7M +static void copy_from_lma(void *vma, void *lma_start, void *lma_end) +{ + memcpy(vma, lma_start, lma_end - lma_start); +} + +static void flush_vectors(void *vma, size_t offset, size_t size) +{ + unsigned long start = (unsigned long)vma + offset; + unsigned long end = start + size; + + flush_icache_range(start, end); +} + +#ifdef CONFIG_HARDEN_BRANCH_HISTORY +int spectre_bhb_update_vectors(unsigned int method) +{ + extern char __vectors_bhb_bpiall_start[], __vectors_bhb_bpiall_end[]; + extern char __vectors_bhb_loop8_start[], __vectors_bhb_loop8_end[]; + void *vec_start, *vec_end; + + if (system_state > SYSTEM_SCHEDULING) { + pr_err("CPU%u: Spectre BHB workaround too late - system vulnerable\n", + smp_processor_id()); + return SPECTRE_VULNERABLE; + } + + switch (method) { + case SPECTRE_V2_METHOD_LOOP8: + vec_start = __vectors_bhb_loop8_start; + vec_end = __vectors_bhb_loop8_end; + break; + + case SPECTRE_V2_METHOD_BPIALL: + vec_start = __vectors_bhb_bpiall_start; + vec_end = __vectors_bhb_bpiall_end; + break; + + default: + pr_err("CPU%u: unknown Spectre BHB state %d\n", + smp_processor_id(), method); + return SPECTRE_VULNERABLE; + } + + copy_from_lma(vectors_page, vec_start, vec_end); + flush_vectors(vectors_page, 0, vec_end - vec_start); + + return SPECTRE_MITIGATED; +} +#endif + void __init early_trap_init(void *vectors_base) { -#ifndef CONFIG_CPU_V7M - unsigned long vectors = (unsigned long)vectors_base; extern char __stubs_start[], __stubs_end[]; extern char __vectors_start[], __vectors_end[]; unsigned i; @@ -852,17 +902,20 @@ void __init early_trap_init(void *vectors_base) * into the vector page, mapped at 0xffff0000, and ensure these * are visible to the instruction stream. */ - memcpy((void *)vectors, __vectors_start, __vectors_end - __vectors_start); - memcpy((void *)vectors + 0x1000, __stubs_start, __stubs_end - __stubs_start); + copy_from_lma(vectors_base, __vectors_start, __vectors_end); + copy_from_lma(vectors_base + 0x1000, __stubs_start, __stubs_end); kuser_init(vectors_base); - flush_icache_range(vectors, vectors + PAGE_SIZE * 2); + flush_vectors(vectors_base, 0, PAGE_SIZE * 2); +} #else /* ifndef CONFIG_CPU_V7M */ +void __init early_trap_init(void *vectors_base) +{ /* * on V7-M there is no need to copy the vector table to a dedicated * memory area. The address is configurable and so a table in the kernel * image can be used. 
*/ -#endif } +#endif diff --git a/arch/arm/kernel/vmlinux.lds.h b/arch/arm/kernel/vmlinux.lds.h index 8247bc15addc419d2b6f4bf222d4f37e172ca0a7..78d156e4f008847486fa052bb74f4372038aa3a6 100644 --- a/arch/arm/kernel/vmlinux.lds.h +++ b/arch/arm/kernel/vmlinux.lds.h @@ -25,6 +25,19 @@ #define ARM_MMU_DISCARD(x) x #endif +/* + * ld.lld does not support NOCROSSREFS: + * https://github.com/ClangBuiltLinux/linux/issues/1609 + */ +#ifdef CONFIG_LD_IS_LLD +#define NOCROSSREFS +#endif + +/* Set start/end symbol names to the LMA for the section */ +#define ARM_LMA(sym, section) \ + sym##_start = LOADADDR(section); \ + sym##_end = LOADADDR(section) + SIZEOF(section) + #define PROC_INFO \ . = ALIGN(4); \ __proc_info_begin = .; \ @@ -100,19 +113,31 @@ * only thing that matters is their relative offsets */ #define ARM_VECTORS \ - __vectors_start = .; \ - .vectors 0xffff0000 : AT(__vectors_start) { \ - *(.vectors) \ + __vectors_lma = .; \ + OVERLAY 0xffff0000 : NOCROSSREFS AT(__vectors_lma) { \ + .vectors { \ + *(.vectors) \ + } \ + .vectors.bhb.loop8 { \ + *(.vectors.bhb.loop8) \ + } \ + .vectors.bhb.bpiall { \ + *(.vectors.bhb.bpiall) \ + } \ } \ - . = __vectors_start + SIZEOF(.vectors); \ - __vectors_end = .; \ + ARM_LMA(__vectors, .vectors); \ + ARM_LMA(__vectors_bhb_loop8, .vectors.bhb.loop8); \ + ARM_LMA(__vectors_bhb_bpiall, .vectors.bhb.bpiall); \ + . = __vectors_lma + SIZEOF(.vectors) + \ + SIZEOF(.vectors.bhb.loop8) + \ + SIZEOF(.vectors.bhb.bpiall); \ \ - __stubs_start = .; \ - .stubs ADDR(.vectors) + 0x1000 : AT(__stubs_start) { \ + __stubs_lma = .; \ + .stubs ADDR(.vectors) + 0x1000 : AT(__stubs_lma) { \ *(.stubs) \ } \ - . = __stubs_start + SIZEOF(.stubs); \ - __stubs_end = .; \ + ARM_LMA(__stubs, .stubs); \ + . = __stubs_lma + SIZEOF(.stubs); \ \ PROVIDE(vector_fiq_offset = vector_fiq - ADDR(.vectors)); diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index b169e580bf8298026c8ae5791193adef3bbef342..096afa16b5317b87797e6fbf53b98e46f8f32609 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -823,6 +823,7 @@ config CPU_BPREDICT_DISABLE config CPU_SPECTRE bool + select GENERIC_CPU_VULNERABILITIES config HARDEN_BRANCH_PREDICTOR bool "Harden the branch predictor against aliasing attacks" if EXPERT @@ -843,6 +844,16 @@ config HARDEN_BRANCH_PREDICTOR If unsure, say Y. +config HARDEN_BRANCH_HISTORY + bool "Harden Spectre style attacks against branch history" if EXPERT + depends on CPU_SPECTRE + default y + help + Speculation attacks against some high-performance processors can + make use of branch history to influence future speculation. When + taking an exception, a sequence of branches overwrites the branch + history, or branch history is invalidated. 
+ config TLS_REG_EMUL bool select NEED_KUSER_HELPERS diff --git a/arch/arm/mm/proc-v7-bugs.c b/arch/arm/mm/proc-v7-bugs.c index ac63a31347c106c55b9c1afb4e6e1b040d5056b1..87a93b76e148efc28c10790a3fd1998349368db1 100644 --- a/arch/arm/mm/proc-v7-bugs.c +++ b/arch/arm/mm/proc-v7-bugs.c @@ -7,8 +7,36 @@ #include #include #include +#include #include +#ifdef CONFIG_ARM_PSCI +#define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED 1 +static int __maybe_unused spectre_v2_get_cpu_fw_mitigation_state(void) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_1, &res); + + switch ((int)res.a0) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + return SPECTRE_UNAFFECTED; + + default: + return SPECTRE_VULNERABLE; + } +} +#else +static int __maybe_unused spectre_v2_get_cpu_fw_mitigation_state(void) +{ + return SPECTRE_VULNERABLE; +} +#endif + #ifdef CONFIG_HARDEN_BRANCH_PREDICTOR DEFINE_PER_CPU(harden_branch_predictor_fn_t, harden_branch_predictor_fn); @@ -37,13 +65,61 @@ static void __maybe_unused call_hvc_arch_workaround_1(void) arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); } -static void cpu_v7_spectre_init(void) +static unsigned int spectre_v2_install_workaround(unsigned int method) { const char *spectre_v2_method = NULL; int cpu = smp_processor_id(); if (per_cpu(harden_branch_predictor_fn, cpu)) - return; + return SPECTRE_MITIGATED; + + switch (method) { + case SPECTRE_V2_METHOD_BPIALL: + per_cpu(harden_branch_predictor_fn, cpu) = + harden_branch_predictor_bpiall; + spectre_v2_method = "BPIALL"; + break; + + case SPECTRE_V2_METHOD_ICIALLU: + per_cpu(harden_branch_predictor_fn, cpu) = + harden_branch_predictor_iciallu; + spectre_v2_method = "ICIALLU"; + break; + + case SPECTRE_V2_METHOD_HVC: + per_cpu(harden_branch_predictor_fn, cpu) = + call_hvc_arch_workaround_1; + cpu_do_switch_mm = cpu_v7_hvc_switch_mm; + spectre_v2_method = "hypervisor"; + break; + + case SPECTRE_V2_METHOD_SMC: + per_cpu(harden_branch_predictor_fn, cpu) = + call_smc_arch_workaround_1; + cpu_do_switch_mm = cpu_v7_smc_switch_mm; + spectre_v2_method = "firmware"; + break; + } + + if (spectre_v2_method) + pr_info("CPU%u: Spectre v2: using %s workaround\n", + smp_processor_id(), spectre_v2_method); + + return SPECTRE_MITIGATED; +} +#else +static unsigned int spectre_v2_install_workaround(unsigned int method) +{ + pr_info("CPU%u: Spectre V2: workarounds disabled by configuration\n", + smp_processor_id()); + + return SPECTRE_VULNERABLE; +} +#endif + +static void cpu_v7_spectre_v2_init(void) +{ + unsigned int state, method = 0; switch (read_cpuid_part()) { case ARM_CPU_PART_CORTEX_A8: @@ -52,32 +128,37 @@ static void cpu_v7_spectre_init(void) case ARM_CPU_PART_CORTEX_A17: case ARM_CPU_PART_CORTEX_A73: case ARM_CPU_PART_CORTEX_A75: - per_cpu(harden_branch_predictor_fn, cpu) = - harden_branch_predictor_bpiall; - spectre_v2_method = "BPIALL"; + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_BPIALL; break; case ARM_CPU_PART_CORTEX_A15: case ARM_CPU_PART_BRAHMA_B15: - per_cpu(harden_branch_predictor_fn, cpu) = - harden_branch_predictor_iciallu; - spectre_v2_method = "ICIALLU"; + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_ICIALLU; break; -#ifdef CONFIG_ARM_PSCI case ARM_CPU_PART_BRAHMA_B53: /* Requires no workaround */ + state = SPECTRE_UNAFFECTED; break; + default: /* Other ARM CPUs require no workaround */ - if (read_cpuid_implementor() == ARM_CPU_IMP_ARM) + if (read_cpuid_implementor() == 
ARM_CPU_IMP_ARM) { + state = SPECTRE_UNAFFECTED; break; + } /* fallthrough */ - /* Cortex A57/A72 require firmware workaround */ + /* Cortex A57/A72 require firmware workaround */ case ARM_CPU_PART_CORTEX_A57: case ARM_CPU_PART_CORTEX_A72: { struct arm_smccc_res res; + state = spectre_v2_get_cpu_fw_mitigation_state(); + if (state != SPECTRE_MITIGATED) + break; + if (psci_ops.smccc_version == SMCCC_VERSION_1_0) break; @@ -88,36 +169,101 @@ static void cpu_v7_spectre_init(void) switch (psci_ops.conduit) { case PSCI_CONDUIT_HVC: - per_cpu(harden_branch_predictor_fn, cpu) = - call_hvc_arch_workaround_1; - cpu_do_switch_mm = cpu_v7_hvc_switch_mm; - spectre_v2_method = "hypervisor"; + method = SPECTRE_V2_METHOD_HVC; break; case PSCI_CONDUIT_SMC: - per_cpu(harden_branch_predictor_fn, cpu) = - call_smc_arch_workaround_1; - cpu_do_switch_mm = cpu_v7_smc_switch_mm; - spectre_v2_method = "firmware"; + method = SPECTRE_V2_METHOD_SMC; break; default: + state = SPECTRE_VULNERABLE; break; } } -#endif } - if (spectre_v2_method) - pr_info("CPU%u: Spectre v2: using %s workaround\n", - smp_processor_id(), spectre_v2_method); + if (state == SPECTRE_MITIGATED) + state = spectre_v2_install_workaround(method); + + spectre_v2_update_state(state, method); +} + +#ifdef CONFIG_HARDEN_BRANCH_HISTORY +static int spectre_bhb_method; + +static const char *spectre_bhb_method_name(int method) +{ + switch (method) { + case SPECTRE_V2_METHOD_LOOP8: + return "loop"; + + case SPECTRE_V2_METHOD_BPIALL: + return "BPIALL"; + + default: + return "unknown"; + } +} + +static int spectre_bhb_install_workaround(int method) +{ + if (spectre_bhb_method != method) { + if (spectre_bhb_method) { + pr_err("CPU%u: Spectre BHB: method disagreement, system vulnerable\n", + smp_processor_id()); + + return SPECTRE_VULNERABLE; + } + + if (spectre_bhb_update_vectors(method) == SPECTRE_VULNERABLE) + return SPECTRE_VULNERABLE; + + spectre_bhb_method = method; + } + + pr_info("CPU%u: Spectre BHB: using %s workaround\n", + smp_processor_id(), spectre_bhb_method_name(method)); + + return SPECTRE_MITIGATED; } #else -static void cpu_v7_spectre_init(void) +static int spectre_bhb_install_workaround(int method) { + return SPECTRE_VULNERABLE; } #endif +static void cpu_v7_spectre_bhb_init(void) +{ + unsigned int state, method = 0; + + switch (read_cpuid_part()) { + case ARM_CPU_PART_CORTEX_A15: + case ARM_CPU_PART_BRAHMA_B15: + case ARM_CPU_PART_CORTEX_A57: + case ARM_CPU_PART_CORTEX_A72: + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_LOOP8; + break; + + case ARM_CPU_PART_CORTEX_A73: + case ARM_CPU_PART_CORTEX_A75: + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_BPIALL; + break; + + default: + state = SPECTRE_UNAFFECTED; + break; + } + + if (state == SPECTRE_MITIGATED) + state = spectre_bhb_install_workaround(method); + + spectre_v2_update_state(state, method); +} + static __maybe_unused bool cpu_v7_check_auxcr_set(bool *warned, u32 mask, const char *msg) { @@ -146,16 +292,17 @@ static bool check_spectre_auxcr(bool *warned, u32 bit) void cpu_v7_ca8_ibe(void) { if (check_spectre_auxcr(this_cpu_ptr(&spectre_warned), BIT(6))) - cpu_v7_spectre_init(); + cpu_v7_spectre_v2_init(); } void cpu_v7_ca15_ibe(void) { if (check_spectre_auxcr(this_cpu_ptr(&spectre_warned), BIT(0))) - cpu_v7_spectre_init(); + cpu_v7_spectre_v2_init(); } void cpu_v7_bugs_init(void) { - cpu_v7_spectre_init(); + cpu_v7_spectre_v2_init(); + cpu_v7_spectre_bhb_init(); } diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 
02c2f528cb608abbc4115a3dbbcc96fd82afe3e0..80ab9c9dd43c099663d93dcfb8e969c1ae2926d9 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1079,6 +1079,15 @@ config ARM64_SSBD If unsure, say Y. +config MITIGATE_SPECTRE_BRANCH_HISTORY + bool "Mitigate Spectre style attacks against branch history" if EXPERT + default y + help + Speculation attacks against some high-performance processors can + make use of branch history to influence future speculation. + When taking an exception from user-space, a sequence of branches + or a firmware call overwrites the branch history. + menuconfig ARMV8_DEPRECATED bool "Emulate deprecated/obsolete ARMv8 instructions" depends on AARCH32_EL0 diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index f364115ac2efecd3deb1dee7d426709201f89d90..348f12447ecd2c5a918e881fc6307dadf9624e50 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -33,14 +34,14 @@ * is therefore used to delimit the MADT GICC structure minimum length * appropriately. */ -#define ACPI_MADT_GICC_MIN_LENGTH ACPI_OFFSET( \ +#define ACPI_MADT_GICC_MIN_LENGTH offsetof( \ struct acpi_madt_generic_interrupt, efficiency_class) #define BAD_MADT_GICC_ENTRY(entry, end) \ (!(entry) || (entry)->header.length < ACPI_MADT_GICC_MIN_LENGTH || \ (unsigned long)(entry) + (entry)->header.length > (end)) -#define ACPI_MADT_GICC_SPE (ACPI_OFFSET(struct acpi_madt_generic_interrupt, \ +#define ACPI_MADT_GICC_SPE (offsetof(struct acpi_madt_generic_interrupt, \ spe_interrupt) + sizeof(u16)) /* Basic configuration for ACPI */ diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index becb5454151e2ce346bc34b9095e38f6bbdcdaa7..21431950531303abe4002fe24f878f08597eae8b 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -118,6 +118,13 @@ hint #20 .endm +/* + * Clear Branch History instruction + */ + .macro clearbhb + hint #22 + .endm + /* * Sanitise a 64-bit bounded index wrt speculation, returning zero if out * of bounds. @@ -703,4 +710,31 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU .Lyield_out_\@ : .endm + .macro __mitigate_spectre_bhb_loop tmp +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY +alternative_cb spectre_bhb_patch_loop_iter + mov \tmp, #32 // Patched to correct the immediate +alternative_cb_end +.Lspectre_bhb_loop\@: + b . + 4 + subs \tmp, \tmp, #1 + b.ne .Lspectre_bhb_loop\@ + dsb nsh + isb +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + .endm + + /* Save/restores x0-x3 to the stack */ + .macro __mitigate_spectre_bhb_fw +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + stp x0, x1, [sp, #-16]! + stp x2, x3, [sp, #-16]! 
+ mov w0, #ARM_SMCCC_ARCH_WORKAROUND_3 +alternative_cb arm64_update_smccc_conduit + nop // Patched to SMC/HVC #0 +alternative_cb_end + ldp x2, x3, [sp], #16 + ldp x0, x1, [sp], #16 +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + .endm #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index 88392272250e8ab3e263cd1cedefec542f87a70c..3a9908a0121904c522b6ce87f0438c22c5f731c9 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -36,6 +36,7 @@ struct cpuinfo_arm64 { u64 reg_id_aa64dfr1; u64 reg_id_aa64isar0; u64 reg_id_aa64isar1; + u64 reg_id_aa64isar2; u64 reg_id_aa64mmfr0; u64 reg_id_aa64mmfr1; u64 reg_id_aa64mmfr2; diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 69515f9077e348f75882b8d6f560f2dd24aa7ae5..f6d2ed5341a4c49cb4663d7793101785be073189 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -64,4 +64,6 @@ #define ARM64_NCAPS 38 #endif +#define ARM64_SPECTRE_BHB 40 + #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index c43d02365fb65075e23b8811a7fa6b5e18e665c9..bfb699957880d2d0e6681695065aa27a72363194 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -492,6 +492,34 @@ static inline bool cpu_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1)); } +static inline bool supports_csv2p3(int scope) +{ + u64 pfr0; + u8 csv2_val; + + if (scope == SCOPE_LOCAL_CPU) + pfr0 = read_sysreg_s(SYS_ID_AA64PFR0_EL1); + else + pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + csv2_val = cpuid_feature_extract_unsigned_field(pfr0, + ID_AA64PFR0_CSV2_SHIFT); + return csv2_val == 3; +} + +static inline bool supports_clearbhb(int scope) +{ + u64 isar2; + + if (scope == SCOPE_LOCAL_CPU) + isar2 = read_sysreg_s(SYS_ID_AA64ISAR2_EL1); + else + isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); + + return cpuid_feature_extract_unsigned_field(isar2, + ID_AA64ISAR2_CLEARBHB_SHIFT); +} + static inline bool system_supports_32bit_el0(void) { return cpus_have_const_cap(ARM64_HAS_32BIT_EL0); @@ -549,6 +577,18 @@ static inline int arm64_get_ssbd_state(void) void arm64_set_ssbd_mitigation(bool state); +/* Watch out, ordering is important here. 
*/ +enum mitigation_state { + SPECTRE_UNAFFECTED, + SPECTRE_MITIGATED, + SPECTRE_VULNERABLE, +}; + +enum mitigation_state arm64_get_spectre_bhb_state(void); +bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope); +u8 spectre_bhb_loop_affected(int scope); +void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *__unused); + static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) { switch (parange) { diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index aa387961213004e700a49e4706f756bfd91b4861..852474568a9ea6f718326208e6a0fa475b6caf8b 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -81,6 +81,10 @@ #define ARM_CPU_PART_CORTEX_A35 0xD04 #define ARM_CPU_PART_CORTEX_A55 0xD05 #define ARM_CPU_PART_CORTEX_A76 0xD0B +#define ARM_CPU_PART_NEOVERSE_V1 0xD40 +#define ARM_CPU_PART_CORTEX_A78 0xD41 +#define ARM_CPU_PART_CORTEX_X1 0xD44 +#define ARM_CPU_PART_CORTEX_A78C 0xD4B #define APM_CPU_PART_POTENZA 0x000 @@ -115,6 +119,11 @@ #define MIDR_CORTEX_A35 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A35) #define MIDR_CORTEX_A55 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A55) #define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76) +#define MIDR_NEOVERSE_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V1) +#define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78) +#define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1) +#define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C) + #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index ec1e6d6fa14ccd4c14a735a8732a7a468c709276..3c962ef081f840ef1dcf796f90aa73d65d863640 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -59,9 +59,11 @@ enum fixed_addresses { #endif /* CONFIG_ACPI_APEI_GHES */ #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + FIX_ENTRY_TRAMP_TEXT3, + FIX_ENTRY_TRAMP_TEXT2, + FIX_ENTRY_TRAMP_TEXT1, FIX_ENTRY_TRAMP_DATA, - FIX_ENTRY_TRAMP_TEXT, -#define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT)) +#define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT1)) #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ __end_of_permanent_fixed_addresses, diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 556351524748ff23fe0885b15563c500d6307072..bf03056e37512a4b90b461e43f5cb8c260a8e6f4 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -625,4 +625,9 @@ void kvm_arch_free_vm(struct kvm *kvm); int kvm_arm_config_vm(struct kvm *kvm, unsigned long type); +static inline enum mitigation_state kvm_arm_get_spectre_bhb_state(void) +{ + return arm64_get_spectre_bhb_state(); +} + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 399228f9f25436438c2aa7e9690af412ddd70445..f8f00c054776983f8c6811a75ffaef54e3bf588f 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -488,7 +488,8 @@ static inline void *kvm_get_hyp_vector(void) void *vect = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector)); int slot = -1; - if 
(cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR) && data->fn) { + if ((cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR) || + cpus_have_const_cap(ARM64_SPECTRE_BHB)) && data->template_start) { vect = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs_start)); slot = data->hyp_vectors_slot; } @@ -517,7 +518,8 @@ static inline int kvm_map_vectors(void) * !HBP + HEL2 -> allocate one vector slot and use exec mapping * HBP + HEL2 -> use hardened vertors and use exec mapping */ - if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR)) { + if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR) || + cpus_have_const_cap(ARM64_SPECTRE_BHB)) { __kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs_start); __kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base); } diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 3e02efc9cdd0c53165d78b47d6fa4e4bbd071e37..fac1a43f4ac36ac20ec25fd04a80332987643153 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -50,7 +50,7 @@ extern int res_mem_count; */ #define ASID(mm) ((mm)->context.id.counter & 0xffff) -static inline bool arm64_kernel_unmapped_at_el0(void) +static __always_inline bool arm64_kernel_unmapped_at_el0(void) { return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && cpus_have_const_cap(ARM64_UNMAP_KERNEL_AT_EL0); @@ -61,6 +61,12 @@ typedef void (*bp_hardening_cb_t)(void); struct bp_hardening_data { int hyp_vectors_slot; bp_hardening_cb_t fn; + + /* + * template_start is only used by the BHB mitigation to identify the + * hyp_vectors_slot sequence. + */ + const char *template_start; }; #if (defined(CONFIG_HARDEN_BRANCH_PREDICTOR) || \ diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index caab039d63055ad7180b6a029cd80d8c572c92eb..8d3f1eab58e049bdc39c62fac562bb65f09fcdd9 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -30,4 +30,9 @@ extern char __irqentry_text_start[], __irqentry_text_end[]; extern char __mmuoff_data_start[], __mmuoff_data_end[]; extern char __entry_tramp_text_start[], __entry_tramp_text_end[]; +static inline size_t entry_tramp_text_size(void) +{ + return __entry_tramp_text_end - __entry_tramp_text_start; +} + #endif /* __ASM_SECTIONS_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index db16c4844de682a91caae0ac6e2e18fac3b401c1..0fd51d253648db5eb190c80b7d78fc723ec4f86a 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -159,6 +159,7 @@ #define SYS_ID_AA64ISAR0_EL1 sys_reg(3, 0, 0, 6, 0) #define SYS_ID_AA64ISAR1_EL1 sys_reg(3, 0, 0, 6, 1) +#define SYS_ID_AA64ISAR2_EL1 sys_reg(3, 0, 0, 6, 2) #define SYS_ID_AA64MMFR0_EL1 sys_reg(3, 0, 0, 7, 0) #define SYS_ID_AA64MMFR1_EL1 sys_reg(3, 0, 0, 7, 1) @@ -527,6 +528,9 @@ #define ID_AA64ISAR1_JSCVT_SHIFT 12 #define ID_AA64ISAR1_DPB_SHIFT 0 +/* id_aa64isar2 */ +#define ID_AA64ISAR2_CLEARBHB_SHIFT 28 + /* id_aa64pfr0 */ #define ID_AA64PFR0_CSV3_SHIFT 60 #define ID_AA64PFR0_CSV2_SHIFT 56 @@ -586,6 +590,7 @@ #endif /* id_aa64mmfr1 */ +#define ID_AA64MMFR1_ECBHB_SHIFT 60 #define ID_AA64MMFR1_PAN_SHIFT 20 #define ID_AA64MMFR1_LOR_SHIFT 16 #define ID_AA64MMFR1_HPD_SHIFT 12 diff --git a/arch/arm64/include/asm/vectors.h b/arch/arm64/include/asm/vectors.h new file mode 100644 index 0000000000000000000000000000000000000000..695583b9a145b79b539295db8cea221da8ea47a4 --- /dev/null +++ b/arch/arm64/include/asm/vectors.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 ARM Ltd. 
+ */ +#ifndef __ASM_VECTORS_H +#define __ASM_VECTORS_H + +#include +#include + +#include +#include + +extern char vectors[]; +extern char tramp_vectors[]; +extern char __bp_harden_el1_vectors[]; + +/* + * Note: the order of this enum corresponds to two arrays in entry.S: + * tramp_vecs and __bp_harden_el1_vectors. By default the canonical + * 'full fat' vectors are used directly. + */ +enum arm64_bp_harden_el1_vectors { +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + /* + * Perform the BHB loop mitigation, before branching to the canonical + * vectors. + */ + EL1_VECTOR_BHB_LOOP, + + /* + * Make the SMC call for firmware mitigation, before branching to the + * canonical vectors. + */ + EL1_VECTOR_BHB_FW, + + /* + * Use the ClearBHB instruction, before branching to the canonical + * vectors. + */ + EL1_VECTOR_BHB_CLEAR_INSN, +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + + /* + * Remap the kernel before branching to the canonical vectors. + */ + EL1_VECTOR_KPTI, +}; + +#ifndef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY +#define EL1_VECTOR_BHB_LOOP -1 +#define EL1_VECTOR_BHB_FW -1 +#define EL1_VECTOR_BHB_CLEAR_INSN -1 +#endif /* !CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + +/* The vectors to use on return from EL0. e.g. to remap the kernel */ +DECLARE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector); + +#ifndef CONFIG_UNMAP_KERNEL_AT_EL0 +#define TRAMP_VALIAS 0 +#endif + +static inline const char * +arm64_get_bp_hardening_vector(enum arm64_bp_harden_el1_vectors slot) +{ + if (arm64_kernel_unmapped_at_el0()) + return (char *)TRAMP_VALIAS + SZ_2K * slot; + + WARN_ON_ONCE(slot == EL1_VECTOR_KPTI); + + return __bp_harden_el1_vectors + SZ_2K * slot; +} + +#endif /* __ASM_VECTORS_H */ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index a6f2451b446906b617a67b1180bf07822c91d9fb..fdff9c6d896932529e60852c3f84036cfe82418d 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -24,6 +24,7 @@ #include #include #include +#include #ifdef CONFIG_HISILICON_ERRATUM_HIP08_RU_PREFETCH #include #include @@ -147,6 +148,16 @@ DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); #ifdef CONFIG_KVM_INDIRECT_VECTORS extern char __smccc_workaround_1_smc_start[]; extern char __smccc_workaround_1_smc_end[]; +extern char __smccc_workaround_3_smc_start[]; +extern char __smccc_workaround_3_smc_end[]; +extern char __spectre_bhb_loop_k8_start[]; +extern char __spectre_bhb_loop_k8_end[]; +extern char __spectre_bhb_loop_k24_start[]; +extern char __spectre_bhb_loop_k24_end[]; +extern char __spectre_bhb_loop_k32_start[]; +extern char __spectre_bhb_loop_k32_end[]; +extern char __spectre_bhb_clearbhb_start[]; +extern char __spectre_bhb_clearbhb_end[]; static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, const char *hyp_vecs_end) @@ -160,11 +171,11 @@ static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, __flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K); } +static DEFINE_SPINLOCK(bp_lock); static void install_bp_hardening_cb(bp_hardening_cb_t fn, const char *hyp_vecs_start, const char *hyp_vecs_end) { - static DEFINE_SPINLOCK(bp_lock); int cpu, slot = -1; spin_lock(&bp_lock); @@ -183,6 +194,7 @@ static void install_bp_hardening_cb(bp_hardening_cb_t fn, __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot); __this_cpu_write(bp_hardening_data.fn, fn); + __this_cpu_write(bp_hardening_data.template_start, hyp_vecs_start); spin_unlock(&bp_lock); } #else @@ -931,6 +943,13 @@ const struct 
arm64_cpu_capabilities arm64_errata[] = { .matches = has_ssbd_mitigation, .midr_range_list = arm64_ssb_cpus, }, + { + .desc = "Spectre-BHB", + .capability = ARM64_SPECTRE_BHB, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = is_spectre_bhb_affected, + .cpu_enable = spectre_bhb_enable_mitigation, + }, #ifdef CONFIG_ARM64_ERRATUM_1463225 { .desc = "ARM erratum 1463225", @@ -957,14 +976,39 @@ ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, return sprintf(buf, "Mitigation: __user pointer sanitization\n"); } +static const char *get_bhb_affected_string(enum mitigation_state bhb_state) +{ + switch (bhb_state) { + case SPECTRE_UNAFFECTED: + return ""; + default: + case SPECTRE_VULNERABLE: + return ", but not BHB"; + case SPECTRE_MITIGATED: + return ", BHB"; + } +} + ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { - if (__spectrev2_safe) - return sprintf(buf, "Not affected\n"); + enum mitigation_state bhb_state = arm64_get_spectre_bhb_state(); + const char *bhb_str = get_bhb_affected_string(bhb_state); + const char *v2_str = "Branch predictor hardening"; + + if (__spectrev2_safe) { + if (bhb_state == SPECTRE_UNAFFECTED) + return sprintf(buf, "Not affected\n"); + + /* + * Platforms affected by Spectre-BHB can't report + * "Not affected" for Spectre-v2. + */ + v2_str = "CSV2"; + } if (__hardenbp_enab) - return sprintf(buf, "Mitigation: Branch predictor hardening\n"); + return sprintf(buf, "Mitigation: %s%s\n", v2_str, bhb_str); return sprintf(buf, "Vulnerable\n"); } @@ -985,3 +1029,327 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, return sprintf(buf, "Vulnerable\n"); } + +/* + * We try to ensure that the mitigation state can never change as the result of + * onlining a late CPU. + */ +static void update_mitigation_state(enum mitigation_state *oldp, + enum mitigation_state new) +{ + enum mitigation_state state; + + do { + state = READ_ONCE(*oldp); + if (new <= state) + break; + } while (cmpxchg_relaxed(oldp, state, new) != state); +} + +/* + * Spectre BHB. + * + * A CPU is either: + * - Mitigated by a branchy loop a CPU specific number of times, and listed + * in our "loop mitigated list". + * - Mitigated in software by the firmware Spectre v2 call. + * - Has the ClearBHB instruction to perform the mitigation. + * - Has the 'Exception Clears Branch History Buffer' (ECBHB) feature, so no + * software mitigation in the vectors is needed. + * - Has CSV2.3, so is unaffected. + */ +static enum mitigation_state spectre_bhb_state; + +enum mitigation_state arm64_get_spectre_bhb_state(void) +{ + return spectre_bhb_state; +} + +/* + * This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any + * SCOPE_SYSTEM call will give the right answer. 
+ */ +u8 spectre_bhb_loop_affected(int scope) +{ + u8 k = 0; + static u8 max_bhb_k; + + if (scope == SCOPE_LOCAL_CPU) { + static const struct midr_range spectre_bhb_k32_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + {}, + }; + static const struct midr_range spectre_bhb_k24_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + {}, + }; + static const struct midr_range spectre_bhb_k8_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + {}, + }; + + if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list)) + k = 32; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list)) + k = 24; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k8_list)) + k = 8; + + max_bhb_k = max(max_bhb_k, k); + } else { + k = max_bhb_k; + } + + return k; +} + +static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void) +{ + int ret; + struct arm_smccc_res res; + + if (psci_ops.smccc_version == SMCCC_VERSION_1_0) + return SPECTRE_VULNERABLE; + + switch (psci_ops.conduit) { + case PSCI_CONDUIT_HVC: + arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_3, &res); + break; + + case PSCI_CONDUIT_SMC: + arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_3, &res); + break; + + default: + return SPECTRE_VULNERABLE; + } + + ret = res.a0; + switch (ret) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + return SPECTRE_UNAFFECTED; + default: + case SMCCC_RET_NOT_SUPPORTED: + return SPECTRE_VULNERABLE; + } +} + +static bool is_spectre_bhb_fw_affected(int scope) +{ + static bool system_affected; + enum mitigation_state fw_state; + bool has_smccc = (psci_ops.smccc_version >= SMCCC_VERSION_1_1); + static const struct midr_range spectre_bhb_firmware_mitigated_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), + {}, + }; + bool cpu_in_list = is_midr_in_range_list(read_cpuid_id(), + spectre_bhb_firmware_mitigated_list); + + if (scope != SCOPE_LOCAL_CPU) + return system_affected; + + fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); + if (cpu_in_list || (has_smccc && fw_state == SPECTRE_MITIGATED)) { + system_affected = true; + return true; + } + + return false; +} + +static bool supports_ecbhb(int scope) +{ + u64 mmfr1; + + if (scope == SCOPE_LOCAL_CPU) + mmfr1 = read_sysreg_s(SYS_ID_AA64MMFR1_EL1); + else + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + + return cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_ECBHB_SHIFT); +} + +bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, + int scope) +{ + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + if (supports_csv2p3(scope)) + return false; + + if (supports_clearbhb(scope)) + return true; + + if (spectre_bhb_loop_affected(scope)) + return true; + + if (is_spectre_bhb_fw_affected(scope)) + return true; + + return false; +} + +static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) +{ + const char *v = arm64_get_bp_hardening_vector(slot); + + if (slot < 0) + return; + + __this_cpu_write(this_cpu_vector, v); + + /* + * When KPTI is in use, the vectors are switched when exiting to + * user-space. 
+ */ + if (arm64_kernel_unmapped_at_el0()) + return; + + write_sysreg(v, vbar_el1); + isb(); +} + +#ifdef CONFIG_KVM_INDIRECT_VECTORS +static const char *kvm_bhb_get_vecs_end(const char *start) +{ + if (start == __smccc_workaround_3_smc_start) + return __smccc_workaround_3_smc_end; + else if (start == __spectre_bhb_loop_k8_start) + return __spectre_bhb_loop_k8_end; + else if (start == __spectre_bhb_loop_k24_start) + return __spectre_bhb_loop_k24_end; + else if (start == __spectre_bhb_loop_k32_start) + return __spectre_bhb_loop_k32_end; + else if (start == __spectre_bhb_clearbhb_start) + return __spectre_bhb_clearbhb_end; + + return NULL; +} + +static void kvm_setup_bhb_slot(const char *hyp_vecs_start) +{ + int cpu, slot = -1; + const char *hyp_vecs_end; + + if (!IS_ENABLED(CONFIG_KVM) || !is_hyp_mode_available()) + return; + + hyp_vecs_end = kvm_bhb_get_vecs_end(hyp_vecs_start); + if (WARN_ON_ONCE(!hyp_vecs_start || !hyp_vecs_end)) + return; + + spin_lock(&bp_lock); + for_each_possible_cpu(cpu) { + if (per_cpu(bp_hardening_data.template_start, cpu) == hyp_vecs_start) { + slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu); + break; + } + } + + if (slot == -1) { + slot = atomic_inc_return(&arm64_el2_vector_last_slot); + BUG_ON(slot >= BP_HARDEN_EL2_SLOTS); + __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end); + } + + __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot); + __this_cpu_write(bp_hardening_data.template_start, hyp_vecs_start); + spin_unlock(&bp_lock); +} +#else +#define __smccc_workaround_3_smc_start NULL +#define __spectre_bhb_loop_k8_start NULL +#define __spectre_bhb_loop_k24_start NULL +#define __spectre_bhb_loop_k32_start NULL +#define __spectre_bhb_clearbhb_start NULL + +static void kvm_setup_bhb_slot(const char *hyp_vecs_start) { }; +#endif + +void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) +{ + enum mitigation_state fw_state, state = SPECTRE_VULNERABLE; + + if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU)) + return; + + if (!__spectrev2_safe && !__hardenbp_enab) { + /* No point mitigating Spectre-BHB alone. */ + } else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) { + pr_info_once("spectre-bhb mitigation disabled by compile time option\n"); + } else if (cpu_mitigations_off()) { + pr_info_once("spectre-bhb mitigation disabled by command line option\n"); + } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) { + state = SPECTRE_MITIGATED; + } else if (supports_clearbhb(SCOPE_LOCAL_CPU)) { + kvm_setup_bhb_slot(__spectre_bhb_clearbhb_start); + this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN); + + state = SPECTRE_MITIGATED; + } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { + switch (spectre_bhb_loop_affected(SCOPE_SYSTEM)) { + case 8: + kvm_setup_bhb_slot(__spectre_bhb_loop_k8_start); + break; + case 24: + kvm_setup_bhb_slot(__spectre_bhb_loop_k24_start); + break; + case 32: + kvm_setup_bhb_slot(__spectre_bhb_loop_k32_start); + break; + default: + WARN_ON_ONCE(1); + } + this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP); + + state = SPECTRE_MITIGATED; + } else if (is_spectre_bhb_fw_affected(SCOPE_LOCAL_CPU)) { + fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); + if (fw_state == SPECTRE_MITIGATED) { + kvm_setup_bhb_slot(__smccc_workaround_3_smc_start); + this_cpu_set_vectors(EL1_VECTOR_BHB_FW); + + /* + * With WA3 in the vectors, the WA1 calls can be + * removed. 
+ */ + __this_cpu_write(bp_hardening_data.fn, NULL); + + state = SPECTRE_MITIGATED; + } + } + + update_mitigation_state(&spectre_bhb_state, state); +} + +/* Patched to correct the immediate */ +void __init spectre_bhb_patch_loop_iter(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + u8 rd; + u32 insn; + u16 loop_count = spectre_bhb_loop_affected(SCOPE_SYSTEM); + + BUG_ON(nr_inst != 1); /* MOV -> MOV */ + + if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) + return; + + insn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + insn = aarch64_insn_gen_movewide(rd, loop_count, 0, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_ZERO); + *updptr++ = cpu_to_le32(insn); +} diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 1bf9d84265de29aff86696704673a7b0f87299d4..aa9178e9acc6190020d5638f9221a42869d04c56 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -20,11 +20,13 @@ #include #include +#include #include #include #include #include #include + #include #include #include @@ -33,6 +35,7 @@ #include #include #include +#include #include unsigned long elf_hwcap __read_mostly; @@ -54,6 +57,8 @@ EXPORT_SYMBOL(cpu_hwcaps); /* Need also bit for ARM64_CB_PATCH */ DECLARE_BITMAP(boot_capabilities, ARM64_NPATCHABLE); +DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors; + /* * Flag to indicate if we have computed the system wide * capabilities based on the boot time active CPUs. This @@ -148,6 +153,11 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64ISAR2_CLEARBHB_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0), @@ -397,6 +407,7 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 0, CRm = 6 */ ARM64_FTR_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0), ARM64_FTR_REG(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1), + ARM64_FTR_REG(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2), /* Op1 = 0, CRn = 0, CRm = 7 */ ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), @@ -545,6 +556,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64DFR1_EL1, info->reg_id_aa64dfr1); init_cpu_ftr_reg(SYS_ID_AA64ISAR0_EL1, info->reg_id_aa64isar0); init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1); + init_cpu_ftr_reg(SYS_ID_AA64ISAR2_EL1, info->reg_id_aa64isar2); init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0); init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1); init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); @@ -662,6 +674,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64isar0, boot->reg_id_aa64isar0); taint |= check_update_ftr_reg(SYS_ID_AA64ISAR1_EL1, cpu, info->reg_id_aa64isar1, boot->reg_id_aa64isar1); + taint |= check_update_ftr_reg(SYS_ID_AA64ISAR2_EL1, cpu, + info->reg_id_aa64isar2, boot->reg_id_aa64isar2); /* * Differing PARange support is fine as long as all peripherals and @@ -811,6 +825,7 @@ static u64 __read_sysreg_by_encoding(u32 sys_id) read_sysreg_case(SYS_ID_AA64MMFR2_EL1); read_sysreg_case(SYS_ID_AA64ISAR0_EL1); read_sysreg_case(SYS_ID_AA64ISAR1_EL1); + read_sysreg_case(SYS_ID_AA64ISAR2_EL1); 
read_sysreg_case(SYS_CNTFRQ_EL0); read_sysreg_case(SYS_CTR_EL0); @@ -1038,6 +1053,12 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) static bool kpti_applied = false; int cpu = smp_processor_id(); + if (__this_cpu_read(this_cpu_vector) == vectors) { + const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI); + + __this_cpu_write(this_cpu_vector, v); + } + if (kpti_applied) return; diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index b79589ab30700dc13570bc67e73b79184d609066..005d88db1082ab8080dd7136c991e3ac382de48d 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -342,6 +342,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1); info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1); info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1); + info->reg_id_aa64isar2 = read_cpuid(ID_AA64ISAR2_EL1); info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); info->reg_id_aa64mmfr2 = read_cpuid(ID_AA64MMFR2_EL1); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 9a044d4255029205eca8bbec30a7292c6ecf5f86..7c231eb211a427ff321a502653dfb6893b98ce9a 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -70,18 +70,21 @@ .macro kernel_ventry, el, label, regsize = 64 .align 7 -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -alternative_if ARM64_UNMAP_KERNEL_AT_EL0 +.Lventry_start\@: .if \el == 0 + /* + * This must be the first instruction of the EL0 vector entries. It is + * skipped by the trampoline vectors, to trigger the cleanup. + */ + b .Lskip_tramp_vectors_cleanup\@ .if \regsize == 64 mrs x30, tpidrro_el0 msr tpidrro_el0, xzr .else mov x30, xzr .endif +.Lskip_tramp_vectors_cleanup\@: .endif -alternative_else_nop_endif -#endif sub sp, sp, #S_FRAME_SIZE #ifdef CONFIG_VMAP_STACK @@ -127,11 +130,15 @@ alternative_else_nop_endif mrs x0, tpidrro_el0 #endif b el\()\el\()_\label +.org .Lventry_start\@ + 128 // Did we overflow the ventry slot? .endm - .macro tramp_alias, dst, sym + .macro tramp_alias, dst, sym, tmp mov_q \dst, TRAMP_VALIAS - add \dst, \dst, #(\sym - .entry.tramp.text) + adr_l \tmp, \sym + add \dst, \dst, \tmp + adr_l \tmp, .entry.tramp.text + sub \dst, \dst, \tmp .endm // This macro corrupts x0-x3. It is the caller's duty @@ -359,25 +366,29 @@ alternative_else_nop_endif ldp x24, x25, [sp, #16 * 12] ldp x26, x27, [sp, #16 * 13] ldp x28, x29, [sp, #16 * 14] - ldr lr, [sp, #S_LR] - add sp, sp, #S_FRAME_SIZE // restore sp /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on eret context synchronization * when returning from IPI handler, and when returning to user-space. */ .if \el == 0 -alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 +alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 + ldr lr, [sp, #S_LR] + add sp, sp, #S_FRAME_SIZE // restore sp + eret +alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 bne 4f - msr far_el1, x30 - tramp_alias x30, tramp_exit_native + msr far_el1, x29 + tramp_alias x30, tramp_exit_native, x29 br x30 4: - tramp_alias x30, tramp_exit_compat + tramp_alias x30, tramp_exit_compat, x29 br x30 #endif .else + ldr lr, [sp, #S_LR] + add sp, sp, #S_FRAME_SIZE // restore sp eret .endif .endm @@ -1020,12 +1031,7 @@ ENDPROC(el0_svc) .popsection // .entry.text -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -/* - * Exception vectors trampoline. 
- */ - .pushsection ".entry.tramp.text", "ax" - + // Move from tramp_pg_dir to swapper_pg_dir .macro tramp_map_kernel, tmp mrs \tmp, ttbr1_el1 add \tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE) @@ -1057,12 +1063,47 @@ alternative_else_nop_endif */ .endm - .macro tramp_ventry, regsize = 64 + .macro tramp_data_page dst + adr_l \dst, .entry.tramp.text + sub \dst, \dst, PAGE_SIZE + .endm + + .macro tramp_data_read_var dst, var +#ifdef CONFIG_RANDOMIZE_BASE + tramp_data_page \dst + add \dst, \dst, #:lo12:__entry_tramp_data_\var + ldr \dst, [\dst] +#else + ldr \dst, =\var +#endif + .endm + +#define BHB_MITIGATION_NONE 0 +#define BHB_MITIGATION_LOOP 1 +#define BHB_MITIGATION_FW 2 +#define BHB_MITIGATION_INSN 3 + + .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 1: .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif + + .if \bhb == BHB_MITIGATION_LOOP + /* + * This sequence must appear before the first indirect branch. i.e. the + * ret out of tramp_ventry. It appears here because x30 is free. + */ + __mitigate_spectre_bhb_loop x30 + .endif // \bhb == BHB_MITIGATION_LOOP + + .if \bhb == BHB_MITIGATION_INSN + clearbhb + isb + .endif // \bhb == BHB_MITIGATION_INSN + + .if \kpti == 1 /* * Defend against branch aliasing attacks by pushing a dummy * entry onto the return stack and using a RET instruction to @@ -1072,43 +1113,75 @@ alternative_else_nop_endif b . 2: tramp_map_kernel x30 -#ifdef CONFIG_RANDOMIZE_BASE - adr x30, tramp_vectors + PAGE_SIZE alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 - ldr x30, [x30] -#else - ldr x30, =vectors -#endif - prfm plil1strm, [x30, #(1b - tramp_vectors)] + tramp_data_read_var x30, vectors + prfm plil1strm, [x30, #(1b - \vector_start)] msr vbar_el1, x30 - add x30, x30, #(1b - tramp_vectors) isb + .else + ldr x30, =vectors + .endif // \kpti == 1 + + .if \bhb == BHB_MITIGATION_FW + /* + * The firmware sequence must appear before the first indirect branch. + * i.e. the ret out of tramp_ventry. But it also needs the stack to be + * mapped to save/restore the registers the SMC clobbers. + */ + __mitigate_spectre_bhb_fw + .endif // \bhb == BHB_MITIGATION_FW + + add x30, x30, #(1b - \vector_start + 4) ret +.org 1b + 128 // Did we overflow the ventry slot? .endm .macro tramp_exit, regsize = 64 - adr x30, tramp_vectors + tramp_data_read_var x30, this_cpu_vector +alternative_if_not ARM64_HAS_VIRT_HOST_EXTN + mrs x29, tpidr_el1 +alternative_else + mrs x29, tpidr_el2 +alternative_endif + ldr x30, [x30, x29] + msr vbar_el1, x30 - tramp_unmap_kernel x30 + ldr lr, [sp, #S_LR] + tramp_unmap_kernel x29 .if \regsize == 64 - mrs x30, far_el1 + mrs x29, far_el1 .endif + add sp, sp, #S_FRAME_SIZE // restore sp eret .endm - .align 11 -ENTRY(tramp_vectors) + .macro generate_tramp_vector, kpti, bhb +.Lvector_start\@: .space 0x400 - tramp_ventry - tramp_ventry - tramp_ventry - tramp_ventry + .rept 4 + tramp_ventry .Lvector_start\@, 64, \kpti, \bhb + .endr + .rept 4 + tramp_ventry .Lvector_start\@, 32, \kpti, \bhb + .endr + .endm - tramp_ventry 32 - tramp_ventry 32 - tramp_ventry 32 - tramp_ventry 32 +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +/* + * Exception vectors trampoline. + * The order must match __bp_harden_el1_vectors and the + * arm64_bp_harden_el1_vectors enum. 
+ */ + .pushsection ".entry.tramp.text", "ax" + .align 11 +ENTRY(tramp_vectors) +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_LOOP + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_FW + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_INSN +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_NONE END(tramp_vectors) ENTRY(tramp_exit_native) @@ -1126,11 +1199,55 @@ END(tramp_exit_compat) .align PAGE_SHIFT .globl __entry_tramp_data_start __entry_tramp_data_start: +__entry_tramp_data_vectors: .quad vectors +#ifdef CONFIG_ARM_SDE_INTERFACE +__entry_tramp_data___sdei_asm_handler: + .quad __sdei_asm_handler +#endif /* CONFIG_ARM_SDE_INTERFACE */ +__entry_tramp_data_this_cpu_vector: + .quad this_cpu_vector .popsection // .rodata #endif /* CONFIG_RANDOMIZE_BASE */ #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ +/* + * Exception vectors for spectre mitigations on entry from EL1 when + * kpti is not in use. + */ + .macro generate_el1_vector, bhb +.Lvector_start\@: + kernel_ventry 1, sync_invalid // Synchronous EL1t + kernel_ventry 1, irq_invalid // IRQ EL1t + kernel_ventry 1, fiq_invalid // FIQ EL1t + kernel_ventry 1, error_invalid // Error EL1t + + kernel_ventry 1, sync // Synchronous EL1h + kernel_ventry 1, irq // IRQ EL1h + kernel_ventry 1, fiq_invalid // FIQ EL1h + kernel_ventry 1, error // Error EL1h + + .rept 4 + tramp_ventry .Lvector_start\@, 64, 0, \bhb + .endr + .rept 4 + tramp_ventry .Lvector_start\@, 32, 0, \bhb + .endr + .endm + +/* The order must match tramp_vecs and the arm64_bp_harden_el1_vectors enum. */ + .pushsection ".entry.text", "ax" + .align 11 +ENTRY(__bp_harden_el1_vectors) +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_el1_vector bhb=BHB_MITIGATION_LOOP + generate_el1_vector bhb=BHB_MITIGATION_FW + generate_el1_vector bhb=BHB_MITIGATION_INSN +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ +END(__bp_harden_el1_vectors) + .popsection + + /* * Register switch for AArch64. The callee-saved registers need to be saved * and restored. 
On entry: @@ -1217,13 +1334,7 @@ ENTRY(__sdei_asm_entry_trampoline) */ 1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)] -#ifdef CONFIG_RANDOMIZE_BASE - adr x4, tramp_vectors + PAGE_SIZE - add x4, x4, #:lo12:__sdei_asm_trampoline_next_handler - ldr x4, [x4] -#else - ldr x4, =__sdei_asm_handler -#endif + tramp_data_read_var x4, __sdei_asm_handler br x4 ENDPROC(__sdei_asm_entry_trampoline) NOKPROBE(__sdei_asm_entry_trampoline) @@ -1246,12 +1357,6 @@ ENDPROC(__sdei_asm_exit_trampoline) NOKPROBE(__sdei_asm_exit_trampoline) .ltorg .popsection // .entry.tramp.text -#ifdef CONFIG_RANDOMIZE_BASE -.pushsection ".rodata", "a" -__sdei_asm_trampoline_next_handler: - .quad __sdei_asm_handler -.popsection // .rodata -#endif /* CONFIG_RANDOMIZE_BASE */ #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ /* @@ -1347,7 +1452,7 @@ alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 - tramp_alias dst=x5, sym=__sdei_asm_exit_trampoline + tramp_alias dst=x5, sym=__sdei_asm_exit_trampoline, tmp=x3 br x5 #endif ENDPROC(__sdei_asm_handler) diff --git a/arch/arm64/kernel/livepatch.c b/arch/arm64/kernel/livepatch.c index d966d4767cb70ac9f342ef737c2578af4e05be79..235e6f8b6719722a39042f948f2813f9e70453c6 100644 --- a/arch/arm64/kernel/livepatch.c +++ b/arch/arm64/kernel/livepatch.c @@ -251,10 +251,10 @@ int arch_klp_patch_func(struct klp_func *func) if (aarch64_insn_patch_text_nosync((void *)pc, insn)) goto ERR_OUT; } else { - insns[0] = cpu_to_le32(0x92800010 | (((~new_addr) & 0xffff)) << 5); - insns[1] = cpu_to_le32(0xf2a00010 | (((new_addr >> 16) & 0xffff)) << 5); - insns[2] = cpu_to_le32(0xf2c00010 | (((new_addr >> 32) & 0xffff)) << 5); - insns[3] = cpu_to_le32(0xd61f0200); + insns[0] = 0x92800010 | (((~new_addr) & 0xffff)) << 5; + insns[1] = 0xf2a00010 | (((new_addr >> 16) & 0xffff)) << 5; + insns[2] = 0xf2c00010 | (((new_addr >> 32) & 0xffff)) << 5; + insns[3] = 0xd61f0200; for (i = 0; i < LJMP_INSN_SIZE; i++) { if (aarch64_insn_patch_text_nosync(((u32 *)pc) + i, insns[i])) goto ERR_OUT; @@ -328,10 +328,10 @@ void arch_klp_unpatch_func(struct klp_func *func) aarch64_insn_patch_text_nosync((void *)pc, insn); } else { - insns[0] = cpu_to_le32(0x92800010 | (((~new_addr) & 0xffff)) << 5); - insns[1] = cpu_to_le32(0xf2a00010 | (((new_addr >> 16) & 0xffff)) << 5); - insns[2] = cpu_to_le32(0xf2c00010 | (((new_addr >> 32) & 0xffff)) << 5); - insns[3] = cpu_to_le32(0xd61f0200); + insns[0] = 0x92800010 | (((~new_addr) & 0xffff)) << 5; + insns[1] = 0xf2a00010 | (((new_addr >> 16) & 0xffff)) << 5; + insns[2] = 0xf2c00010 | (((new_addr >> 32) & 0xffff)) << 5; + insns[3] = 0xd61f0200; for (i = 0; i < LJMP_INSN_SIZE; i++) aarch64_insn_patch_text_nosync(((u32 *)pc) + i, insns[i]); diff --git a/arch/arm64/kernel/mpam/mpam_device.c b/arch/arm64/kernel/mpam/mpam_device.c index 4e882e81cf416c7e96b0444044a92cf047ce545a..6455c69f132fd47fff3a872420bc4b446bf44d12 100644 --- a/arch/arm64/kernel/mpam/mpam_device.c +++ b/arch/arm64/kernel/mpam/mpam_device.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "mpam_resource.h" #include "mpam_device.h" @@ -621,7 +622,7 @@ static void mpam_failed(struct work_struct *work) mutex_unlock(&mpam_cpuhp_lock); } -static struct mpam_device * __init +static struct mpam_device * mpam_device_alloc(struct mpam_component *comp) { struct mpam_device *dev; @@ -656,7 +657,7 @@ static void mpam_devices_destroy(struct mpam_component *comp) } } -static struct mpam_component * __init mpam_component_alloc(int id) +static struct mpam_component 
*mpam_component_alloc(int id) { struct mpam_component *comp; @@ -694,7 +695,7 @@ struct mpam_component *mpam_component_get(struct mpam_class *class, int id, return comp; } -static struct mpam_class * __init mpam_class_alloc(u8 level_idx, +static struct mpam_class *mpam_class_alloc(u8 level_idx, enum mpam_class_types type) { struct mpam_class *class; @@ -733,7 +734,7 @@ static void mpam_class_destroy(struct mpam_class *class) } } -static struct mpam_class * __init mpam_class_get(u8 level_idx, +static struct mpam_class *mpam_class_get(u8 level_idx, enum mpam_class_types type, bool alloc) { @@ -1718,10 +1719,9 @@ static const struct of_device_id arm_mpam_of_device_ids[] = { { } }; -static int of_mpam_parse_irq(struct platform_device *pdev, +static int of_mpam_parse_irq(struct device_node *node, struct mpam_device *dev) { - struct device_node *node = pdev->dev.of_node; u32 overflow_interrupt, overflow_flags; u32 error_interrupt, error_interrupt_flags; @@ -1736,12 +1736,12 @@ static int of_mpam_parse_irq(struct platform_device *pdev, error_interrupt, error_interrupt_flags); } -static int of_mpam_parse_cache(struct platform_device *pdev) +static int of_mpam_parse_cache(struct platform_device *pdev, + struct device_node *node) { struct mpam_device *dev; - struct device_node *node = pdev->dev.of_node; int cache_level, cache_id; - struct resource *res; + u64 reg_value[2]; if (of_property_read_u32(node, "cache-level", &cache_level)) { dev_err(&pdev->dev, "missing cache level property\n"); @@ -1754,27 +1754,27 @@ static int of_mpam_parse_cache(struct platform_device *pdev) } /* Base address */ - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { + if (of_property_read_u64_array(node, "reg", reg_value, 2)) { dev_err(&pdev->dev, "missing io resource property\n"); return -EINVAL; } - dev = mpam_device_create_cache(cache_level, cache_id, NULL, res->start); + dev = mpam_device_create_cache(cache_level, cache_id, NULL, + reg_value[0]); if (IS_ERR(dev)) { dev_err(&pdev->dev, "Failed to create cache node\n"); return -EINVAL; } - return of_mpam_parse_irq(pdev, dev); + return of_mpam_parse_irq(node, dev); } -static int of_mpam_parse_memory(struct platform_device *pdev) +static int of_mpam_parse_memory(struct platform_device *pdev, + struct device_node *node) { struct mpam_device *dev; - struct device_node *node = pdev->dev.of_node; int numa_id; - struct resource *res; + u64 reg_value[2]; if (of_property_read_u32(node, "numa-node-id", &numa_id)) { dev_err(&pdev->dev, "missing numa node id property\n"); @@ -1782,40 +1782,35 @@ static int of_mpam_parse_memory(struct platform_device *pdev) } /* Base address */ - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { + if (of_property_read_u64_array(node, "reg", reg_value, 2)) { dev_err(&pdev->dev, "missing io resource property\n"); return -EINVAL; } - dev = mpam_device_create_memory(numa_id, res->start); + dev = mpam_device_create_memory(numa_id, reg_value[0]); if (IS_ERR(dev)) { dev_err(&pdev->dev, "Failed to create memory node\n"); return -EINVAL; } - return of_mpam_parse_irq(pdev, dev); + return of_mpam_parse_irq(node, dev); } -static int of_mpam_parse(struct platform_device *pdev) +static int of_mpam_add_child(struct platform_device *pdev, + struct device_node *node) { - struct device *dev = &pdev->dev; - struct device_node *node = dev->of_node; enum mpam_class_types type; - if (!node || !of_match_node(arm_mpam_of_device_ids, pdev->dev.of_node)) - return -EINVAL; - - if (of_property_read_u32(dev->of_node, "type", &type)) { - 
dev_err(dev, "missing type property\n"); + if (of_property_read_u32(node, "type", &type)) { + dev_err(&pdev->dev, "missing type property\n"); return -EINVAL; } switch (type) { case MPAM_CLASS_CACHE: - return of_mpam_parse_cache(pdev); + return of_mpam_parse_cache(pdev, node); case MPAM_CLASS_MEMORY: - return of_mpam_parse_memory(pdev); + return of_mpam_parse_memory(pdev, node); default: pr_warn_once("Unknown node type %u.\n", type); return -EINVAL; @@ -1833,6 +1828,9 @@ static int of_mpam_parse(struct platform_device *pdev) static int arm_mpam_device_probe(struct platform_device *pdev) { int ret; + struct device *dev = &pdev->dev; + struct device_node *node = dev->of_node; + struct device_node *child = NULL; if (!cpus_have_const_cap(ARM64_HAS_MPAM)) return 0; @@ -1840,11 +1838,18 @@ static int arm_mpam_device_probe(struct platform_device *pdev) if (!acpi_disabled || mpam_enabled != MPAM_ENABLE_OF) return 0; + if (!node || !of_match_node(arm_mpam_of_device_ids, pdev->dev.of_node)) + return -EINVAL; + ret = mpam_discovery_start(); if (ret) return ret; - ret = of_mpam_parse(pdev); + for_each_available_child_of_node(node, child) { + ret = of_mpam_add_child(pdev, child); + if (ret) + break; + } if (ret) { mpam_discovery_failed(); diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c index fe2dcf92100f8ae86909cc0404b3cb3eeee6d65e..60d3d8706a38bbff4fb192823120b722f49a8891 100644 --- a/arch/arm64/kernel/mpam/mpam_resctrl.c +++ b/arch/arm64/kernel/mpam/mpam_resctrl.c @@ -839,7 +839,8 @@ static inline unsigned long **__rmid_remap_bmp(u32 col) #define __step_xy_initialize(step, x, y, from) \ (x = from, step = 1, y = 0) #define __step_align(from) \ - (!(from % rmid_remap_matrix.step_size)) + (!(from % (rmid_remap_matrix.step_size * \ + rmid_remap_matrix.step_cnt))) #define __step_overflow(step) \ (__xy_overflow(x, y) || \ (step > rmid_remap_matrix.step_cnt)) @@ -913,7 +914,7 @@ static int is_rmid_remap_bmp_full(unsigned long *bmp) bitmap_full(bmp, rmid_remap_matrix.rows)); } -static int rmid_remap_bmp_find_first_avail_partid(int partid) +static int rmid_remap_bmp_find_step_entry(int partid) { int x, y; unsigned long **bmp; @@ -922,17 +923,18 @@ static int rmid_remap_bmp_find_first_avail_partid(int partid) rmid_remap_matrix.cols) return 0; + /* step entry should be non-occupied and aligned */ bmp = __rmid_remap_bmp(partid); - if (bmp && !is_rmid_remap_bmp_occ(*bmp)) - return partid; + if (bmp) + return (is_rmid_remap_bmp_occ(*bmp) || + !__step_align(partid)) ? -ENOSPC : partid; for_each_rmid_transform_point_from(bmp, x, y, 0) { /* * do not waste partid resource, start - * from step_size aligned position. + * from step aligned position. */ - if (!is_rmid_remap_bmp_occ(*bmp) && - (x % rmid_remap_matrix.step_size) == 0) + if (__step_align(x) && !is_rmid_remap_bmp_occ(*bmp)) return x; } @@ -1026,8 +1028,8 @@ static int __rmid_alloc(int partid, int pmg) if (pmg >= 0) checkpmg = true; - /* traverse from first non-occupied and step_size aligned entry */ - ret = rmid_remap_bmp_find_first_avail_partid(partid); + /* traverse from first non-occupied and step-aligned entry */ + ret = rmid_remap_bmp_find_step_entry(partid); if (ret < 0) goto out; partid = ret; @@ -1135,7 +1137,7 @@ void closid_free(int closid) * Choose a width for the resource name and resource data based on the * resource that has widest name and cbm. 
*/ -static __init void mpam_init_padding(void) +static void mpam_init_padding(void) { int cl; struct mpam_resctrl_res *res; diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index d6050c6e65bc1725f0d691f902d13cd190d02378..2b965dd67881cca7ba78775ff9c85df997727e58 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -251,7 +251,7 @@ ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) <= SZ_4K, "Hibernate exit text too big or misaligned") #endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE, +ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) <= 3*PAGE_SIZE, "Entry trampoline text too big") #endif /* diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index b1f14f736962f938911088f63d1f833f9a7c2c81..454012f16c00d81bddee7f2e1443ed689a90f179 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -111,6 +111,10 @@ el1_hvc_guest: /* ARM_SMCCC_ARCH_WORKAROUND_2 handling */ eor w1, w1, #(ARM_SMCCC_ARCH_WORKAROUND_1 ^ \ ARM_SMCCC_ARCH_WORKAROUND_2) + cbz w1, wa_epilogue + + eor w1, w1, #(ARM_SMCCC_ARCH_WORKAROUND_2 ^ \ + ARM_SMCCC_ARCH_WORKAROUND_3) cbnz w1, el1_trap #ifdef CONFIG_ARM64_SSBD @@ -329,4 +333,64 @@ ENTRY(__smccc_workaround_1_smc_start) ldp x0, x1, [sp, #(8 * 2)] add sp, sp, #(8 * 4) ENTRY(__smccc_workaround_1_smc_end) + +ENTRY(__smccc_workaround_3_smc_start) + esb + sub sp, sp, #(8 * 4) + stp x2, x3, [sp, #(8 * 0)] + stp x0, x1, [sp, #(8 * 2)] + mov w0, #ARM_SMCCC_ARCH_WORKAROUND_3 + smc #0 + ldp x2, x3, [sp, #(8 * 0)] + ldp x0, x1, [sp, #(8 * 2)] + add sp, sp, #(8 * 4) +ENTRY(__smccc_workaround_3_smc_end) + +ENTRY(__spectre_bhb_loop_k8_start) + esb + sub sp, sp, #(8 * 2) + stp x0, x1, [sp, #(8 * 0)] + mov x0, #8 +2: b . + 4 + subs x0, x0, #1 + b.ne 2b + dsb nsh + isb + ldp x0, x1, [sp, #(8 * 0)] + add sp, sp, #(8 * 2) +ENTRY(__spectre_bhb_loop_k8_end) + +ENTRY(__spectre_bhb_loop_k24_start) + esb + sub sp, sp, #(8 * 2) + stp x0, x1, [sp, #(8 * 0)] + mov x0, #24 +2: b . + 4 + subs x0, x0, #1 + b.ne 2b + dsb nsh + isb + ldp x0, x1, [sp, #(8 * 0)] + add sp, sp, #(8 * 2) +ENTRY(__spectre_bhb_loop_k24_end) + +ENTRY(__spectre_bhb_loop_k32_start) + esb + sub sp, sp, #(8 * 2) + stp x0, x1, [sp, #(8 * 0)] + mov x0, #32 +2: b . 
+ 4 + subs x0, x0, #1 + b.ne 2b + dsb nsh + isb + ldp x0, x1, [sp, #(8 * 0)] + add sp, sp, #(8 * 2) +ENTRY(__spectre_bhb_loop_k32_end) + +ENTRY(__spectre_bhb_clearbhb_start) + esb + clearbhb + isb +ENTRY(__spectre_bhb_clearbhb_end) #endif diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index ae473600cb9752805a50ef00b9a19514cdab90f4..6473c7c96b14d11e324fc27ccb8d03ea51c7a2c6 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -35,6 +35,7 @@ #include #include #include +#include /* Check whether the FP regs were dirtied while in the host-side run loop: */ static bool __hyp_text update_fp_enabled(struct kvm_vcpu *vcpu) @@ -153,10 +154,13 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu) static void deactivate_traps_vhe(void) { - extern char vectors[]; /* kernel exception vectors */ + const char *host_vectors = vectors; write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1); - write_sysreg(vectors, vbar_el1); + + if (!arm64_kernel_unmapped_at_el0()) + host_vectors = __this_cpu_read(this_cpu_vector); + write_sysreg(host_vectors, vbar_el1); } NOKPROBE_SYMBOL(deactivate_traps_vhe); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 29496548c56b9e2e554dab3651f8cac2bd45f484..25caa9ec95e4ce532683a177b3a0ba3531ce02c8 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1325,7 +1325,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* CRm=6 */ ID_SANITISED(ID_AA64ISAR0_EL1), ID_SANITISED(ID_AA64ISAR1_EL1), - ID_UNALLOCATED(6,2), + ID_SANITISED(ID_AA64ISAR2_EL1), ID_UNALLOCATED(6,3), ID_UNALLOCATED(6,4), ID_UNALLOCATED(6,5), diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 35a103c7c22bf7fb8c5cb1e918682063767f1d57..e7cc878a0b05f3b6d2b2a5f345bf036121109afa 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -310,6 +310,7 @@ unsigned long mm_context_get(struct mm_struct *mm) return asid; } +EXPORT_SYMBOL_GPL(mm_context_get); void mm_context_put(struct mm_struct *mm) { @@ -325,6 +326,7 @@ void mm_context_put(struct mm_struct *mm) raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); } +EXPORT_SYMBOL_GPL(mm_context_put); /* Errata workaround post TTBRx_EL1 update. */ asmlinkage void post_ttbr_update_workaround(void) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 300972b01939b3a0a3aba01c8fb33431aa8fbbae..fa53bee52f87ca8e6a5f48006ebf53b7d754fd70 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -541,6 +541,8 @@ early_param("rodata", parse_rodata); #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 static int __init map_entry_trampoline(void) { + int i; + pgprot_t prot = rodata_enabled ? 
PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start); @@ -549,11 +551,15 @@ static int __init map_entry_trampoline(void) /* Map only the text into the trampoline page table */ memset(tramp_pg_dir, 0, PGD_SIZE); - __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE, - prot, pgd_pgtable_alloc, 0); + __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, + entry_tramp_text_size(), prot, pgd_pgtable_alloc, + 0); /* Map both the text and data into the kernel page table */ - __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot); + for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++) + __set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i, + pa_start + i * PAGE_SIZE, prot); + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { extern char __entry_tramp_data_start[]; diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 4570a54ac1d90ceaece42292ab8495e6cdc0ef66..1a6ffb5515ed16b8136ba108620fd3fad2573397 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -53,6 +53,7 @@ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ #define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */ +#define MAP_ALIGN 0x2000000 /* create an aligned mapping */ /* * Flags for msync diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 06857eb1bee8f165b61ea6385842c9d0b9fcd91f..8d88c457fe100d2b58fff8b9cb80ef1369277558 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -29,6 +29,7 @@ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ #define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */ +#define MAP_ALIGN 0x2000000 /* create an aligned mapping */ #define MS_SYNC 1 /* synchronous memory sync */ #define MS_ASYNC 2 /* sync memory asynchronously */ diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h index 24354f792b00da13f1b921ee96cfe270d35f9585..802cc3fe1358cee39f1b126bbcbc37d2e96b9a04 100644 --- a/arch/powerpc/include/uapi/asm/mman.h +++ b/arch/powerpc/include/uapi/asm/mman.h @@ -31,6 +31,7 @@ #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ #define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */ +#define MAP_ALIGN 0x2000000 /* create an aligned mapping */ /* Override any generic PKEY permission defines */ #define PKEY_DISABLE_EXECUTE 0x4 diff --git a/arch/sparc/include/uapi/asm/mman.h b/arch/sparc/include/uapi/asm/mman.h index 214abe17f44a0d7d1e93b4131dd00f202377a02c..953528235c2e52f4c1ca0159bea29eb7681beb8c 100644 --- a/arch/sparc/include/uapi/asm/mman.h +++ b/arch/sparc/include/uapi/asm/mman.h @@ -28,6 +28,7 @@ #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ #define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */ +#define MAP_ALIGN 0x2000000 /* create an aligned mapping */ #endif /* _UAPI__SPARC_MMAN_H__ */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ccb5e3486aee79b485d05f0e99628461fc9019ec..ca651113bea2dcb061ac666aad807becf063c5ee 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -312,7 +312,6 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) */ 
syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ - UNWIND_HINT_EMPTY POP_REGS pop_rdi=0 skip_r11rcx=1 /* @@ -321,6 +320,7 @@ syscall_return_via_sysret: */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + UNWIND_HINT_EMPTY pushq RSP-RDI(%rdi) /* RSP */ pushq (%rdi) /* RDI */ @@ -700,6 +700,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + UNWIND_HINT_EMPTY /* Copy the IRET frame to the trampoline stack. */ pushq 6*8(%rdi) /* SS */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0a4145100c3a1ed43d1fd33a9346494764091ae2..a75ef95455d989be75b737f9e52c5728626d2c4c 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -224,7 +224,7 @@ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e3f70c60e8ccd0dd6b60f9e666730bf5e34f3094..ddcd4d5eecece02243f0dc2a503466ce631f6318 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -119,7 +119,7 @@ ANNOTATE_NOSPEC_ALTERNATIVE ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *\reg), \ __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_AMD + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_LFENCE #else jmp *\reg #endif @@ -130,7 +130,7 @@ ANNOTATE_NOSPEC_ALTERNATIVE ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *\reg), \ __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_AMD + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_LFENCE #else call *\reg #endif @@ -181,7 +181,7 @@ "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_AMD) + X86_FEATURE_RETPOLINE_LFENCE) # define THUNK_TARGET(addr) [thunk_target] "r" (addr) #else /* CONFIG_X86_32 */ @@ -211,7 +211,7 @@ "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_AMD) + X86_FEATURE_RETPOLINE_LFENCE) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif @@ -223,9 +223,11 @@ /* The Spectre V2 mitigation variants */ enum spectre_v2_mitigation { SPECTRE_V2_NONE, - SPECTRE_V2_RETPOLINE_GENERIC, - SPECTRE_V2_RETPOLINE_AMD, - SPECTRE_V2_IBRS_ENHANCED, + SPECTRE_V2_RETPOLINE, + SPECTRE_V2_LFENCE, + SPECTRE_V2_EIBRS, + SPECTRE_V2_EIBRS_RETPOLINE, + SPECTRE_V2_EIBRS_LFENCE, }; /* The indirect branch speculation control variants */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index fe6f9e7f15bb5877837bcba92acf21e226011803..1d79485ae9721dc2c8976c5fb7082adfd268fb37 100644 --- a/arch/x86/include/asm/pgtable.h 
+++ b/arch/x86/include/asm/pgtable.h @@ -1353,8 +1353,8 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) #endif #endif -#define PKRU_AD_BIT 0x1 -#define PKRU_WD_BIT 0x2 +#define PKRU_AD_BIT 0x1u +#define PKRU_WD_BIT 0x2u #define PKRU_BITS_PER_PKEY 2 static inline bool __pkru_allows_read(u32 pkru, u16 pkey) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index e119c78581d14562bfc5159b3099b8afd9de4135..307aa9454df30b76a5d87f7cfd7bbe9945e5a61f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -763,6 +763,7 @@ extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void load_percpu_segment(int); extern void cpu_init(void); +extern void cr4_init(void); static inline unsigned long get_debugctlmsr(void) { diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index b684dfdaaec1cbe82cfa304ab1621b29178e444e..af5f0d9a99511edeba28d5ece62d68753f3715ff 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -6,6 +6,8 @@ #ifdef __KERNEL__ #include +#include +#include /* * Volatile isn't enough to prevent the compiler from reordering the @@ -16,6 +18,8 @@ */ extern unsigned long __force_order; +void native_write_cr0(unsigned long val); + static inline unsigned long native_read_cr0(void) { unsigned long val; @@ -23,11 +27,6 @@ static inline unsigned long native_read_cr0(void) return val; } -static inline void native_write_cr0(unsigned long val) -{ - asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); -} - static inline unsigned long native_read_cr2(void) { unsigned long val; @@ -72,10 +71,7 @@ static inline unsigned long native_read_cr4(void) return val; } -static inline void native_write_cr4(unsigned long val) -{ - asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); -} +void native_write_cr4(unsigned long val); #ifdef CONFIG_X86_64 static inline unsigned long native_read_cr8(void) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 54ec900bd259a92eb4c090eb59ee553528d29385..4651e9db46a6788ece41779152d10d694e4536f7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "cpu.h" @@ -607,6 +608,32 @@ static inline const char *spectre_v2_module_string(void) static inline const char *spectre_v2_module_string(void) { return ""; } #endif +#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" +#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" + +#ifdef CONFIG_BPF_SYSCALL +void unpriv_ebpf_notify(int new_state) +{ + if (new_state) + return; + + /* Unprivileged eBPF is enabled */ + + switch (spectre_v2_enabled) { + case SPECTRE_V2_EIBRS: + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + break; + case SPECTRE_V2_EIBRS_LFENCE: + if (sched_smt_active()) + pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + break; + default: + break; + } +} +#endif + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt); @@ -621,7 +648,10 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_FORCE, SPECTRE_V2_CMD_RETPOLINE, SPECTRE_V2_CMD_RETPOLINE_GENERIC, - SPECTRE_V2_CMD_RETPOLINE_AMD, + 
SPECTRE_V2_CMD_RETPOLINE_LFENCE, + SPECTRE_V2_CMD_EIBRS, + SPECTRE_V2_CMD_EIBRS_RETPOLINE, + SPECTRE_V2_CMD_EIBRS_LFENCE, }; enum spectre_v2_user_cmd { @@ -694,6 +724,13 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) return SPECTRE_V2_USER_CMD_AUTO; } +static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) +{ + return (mode == SPECTRE_V2_EIBRS || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_EIBRS_LFENCE); +} + static void __init spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) { @@ -756,10 +793,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) } /* - * If enhanced IBRS is enabled or SMT impossible, STIBP is not + * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not * required. */ - if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + if (!boot_cpu_has(X86_FEATURE_STIBP) || + !smt_possible || + spectre_v2_in_eibrs_mode(spectre_v2_enabled)) return; /* @@ -771,12 +810,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) mode = SPECTRE_V2_USER_STRICT_PREFERRED; - /* - * If STIBP is not available, clear the STIBP mode. - */ - if (!boot_cpu_has(X86_FEATURE_STIBP)) - mode = SPECTRE_V2_USER_NONE; - spectre_v2_user_stibp = mode; set_mode: @@ -785,9 +818,11 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", - [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", - [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", - [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", + [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", }; static const struct { @@ -798,8 +833,12 @@ static const struct { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, + { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, + { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, + { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, + { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, { "auto", SPECTRE_V2_CMD_AUTO, false }, }; @@ -836,17 +875,30 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) } if ((cmd == SPECTRE_V2_CMD_RETPOLINE || - cmd == SPECTRE_V2_CMD_RETPOLINE_AMD || - cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && + cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !IS_ENABLED(CONFIG_RETPOLINE)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); + pr_err("%s selected but not compiled in. 
Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_EIBRS || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } - if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON && - boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { - pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); + if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && + !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } @@ -855,6 +907,16 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return cmd; } +static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) +{ + if (!IS_ENABLED(CONFIG_RETPOLINE)) { + pr_err("Kernel not compiled with retpoline; no mitigation available!"); + return SPECTRE_V2_NONE; + } + + return SPECTRE_V2_RETPOLINE; +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -875,49 +937,64 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - mode = SPECTRE_V2_IBRS_ENHANCED; - /* Force it so VMEXIT will restore correctly */ - x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - goto specv2_set_mode; + mode = SPECTRE_V2_EIBRS; + break; } - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_auto; + + mode = spectre_v2_select_retpoline(); break; - case SPECTRE_V2_CMD_RETPOLINE_AMD: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_amd; + + case SPECTRE_V2_CMD_RETPOLINE_LFENCE: + pr_err(SPECTRE_V2_LFENCE_MSG); + mode = SPECTRE_V2_LFENCE; break; + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_generic; + mode = SPECTRE_V2_RETPOLINE; break; + case SPECTRE_V2_CMD_RETPOLINE: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_auto; + mode = spectre_v2_select_retpoline(); + break; + + case SPECTRE_V2_CMD_EIBRS: + mode = SPECTRE_V2_EIBRS; + break; + + case SPECTRE_V2_CMD_EIBRS_LFENCE: + mode = SPECTRE_V2_EIBRS_LFENCE; + break; + + case SPECTRE_V2_CMD_EIBRS_RETPOLINE: + mode = SPECTRE_V2_EIBRS_RETPOLINE; break; } - pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!"); - return; -retpoline_auto: - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - retpoline_amd: - if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); - goto retpoline_generic; - } - mode = SPECTRE_V2_RETPOLINE_AMD; - setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); - setup_force_cpu_cap(X86_FEATURE_RETPOLINE); - } else { - retpoline_generic: - mode = SPECTRE_V2_RETPOLINE_GENERIC; + if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + + if (spectre_v2_in_eibrs_mode(mode)) { + /* Force it so VMEXIT will restore correctly */ + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); 
+ } + + switch (mode) { + case SPECTRE_V2_NONE: + case SPECTRE_V2_EIBRS: + break; + + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_EIBRS_LFENCE: + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); + /* fallthrough */ + + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_EIBRS_RETPOLINE: setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + break; } -specv2_set_mode: spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); @@ -943,7 +1020,7 @@ static void __init spectre_v2_select_mitigation(void) * the CPU supports Enhanced IBRS, kernel might un-intentionally not * enable IBRS around firmware calls. */ - if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) { + if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } @@ -1013,6 +1090,10 @@ void arch_smt_update(void) { mutex_lock(&spec_ctrl_mutex); + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: break; @@ -1588,7 +1669,7 @@ static ssize_t tsx_async_abort_show_state(char *buf) static char *stibp_state(void) { - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + if (spectre_v2_in_eibrs_mode(spectre_v2_enabled)) return ""; switch (spectre_v2_user_stibp) { @@ -1618,6 +1699,27 @@ static char *ibpb_state(void) return ""; } +static ssize_t spectre_v2_show_state(char *buf) +{ + if (spectre_v2_enabled == SPECTRE_V2_LFENCE) + return sprintf(buf, "Vulnerable: LFENCE\n"); + + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); + + return sprintf(buf, "%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + spectre_v2_module_string()); +} + static ssize_t srbds_show_state(char *buf) { return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); @@ -1643,12 +1745,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: - return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? 
", RSB filling" : "", - spectre_v2_module_string()); + return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6fe28a1b18671e298dde3582f50a3714b36aa34f..a0306327e7ed38d029a0b9ba9eef61eb3f7df0b6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -365,6 +365,77 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c) cr4_clear_bits(X86_CR4_UMIP); } +static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); +static unsigned long cr4_pinned_bits __ro_after_init; + +void native_write_cr0(unsigned long val) +{ + unsigned long bits_missing = 0; + +set_register: + asm volatile("mov %0,%%cr0": "+r" (val), "+m" (__force_order)); + + if (static_branch_likely(&cr_pinning)) { + if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) { + bits_missing = X86_CR0_WP; + val |= bits_missing; + goto set_register; + } + /* Warn after we've set the missing bits. */ + WARN_ONCE(bits_missing, "CR0 WP bit went missing!?\n"); + } +} +EXPORT_SYMBOL(native_write_cr0); + +void native_write_cr4(unsigned long val) +{ + unsigned long bits_missing = 0; + +set_register: + asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits)); + + if (static_branch_likely(&cr_pinning)) { + if (unlikely((val & cr4_pinned_bits) != cr4_pinned_bits)) { + bits_missing = ~val & cr4_pinned_bits; + val |= bits_missing; + goto set_register; + } + /* Warn after we've set the missing bits. */ + WARN_ONCE(bits_missing, "CR4 bits went missing: %lx!?\n", + bits_missing); + } +} +EXPORT_SYMBOL(native_write_cr4); + +void cr4_init(void) +{ + unsigned long cr4 = __read_cr4(); + + if (boot_cpu_has(X86_FEATURE_PCID)) + cr4 |= X86_CR4_PCIDE; + if (static_branch_likely(&cr_pinning)) + cr4 |= cr4_pinned_bits; + + __write_cr4(cr4); + + /* Initialize cr4 shadow for this CPU. */ + this_cpu_write(cpu_tlbstate.cr4, cr4); +} + +/* + * Once CPU feature detection is finished (and boot params have been + * parsed), record any of the sensitive CR bits that are set, and + * enable CR pinning. + */ +static void __init setup_cr_pinning(void) +{ + unsigned long mask; + + mask = (X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP); + cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & mask; + static_key_enable(&cr_pinning.key); +} + /* * Protection Keys are not available in 32-bit mode. */ @@ -1588,6 +1659,7 @@ void __init identify_boot_cpu(void) #endif cpu_detect_tlb(&boot_cpu_data); tsx_init(); + setup_cr_pinning(); } void identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -1831,12 +1903,6 @@ void cpu_init(void) wait_for_master_cpu(cpu); - /* - * Initialize the CR4 shadow before doing anything that could - * try to read it. - */ - cr4_init_shadow(); - if (cpu) load_ucode_ap(); @@ -1936,12 +2002,6 @@ void cpu_init(void) wait_for_master_cpu(cpu); - /* - * Initialize the CR4 shadow before doing anything that could - * try to read it. - */ - cr4_init_shadow(); - show_ucode_info_early(); pr_info("Initializing CPU#%d\n", cpu); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 08b5f534fabb3dd75198f6eafda386d2547fe302..ee697fa8847d6f2d32baef8bc1218cb7a9a39f82 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -217,17 +217,11 @@ static void notrace start_secondary(void *unused) * before cpu_init(), SMP booting is too fragile that we want to * limit the things done here to the most necessary things. 
*/ - if (boot_cpu_has(X86_FEATURE_PCID)) - __write_cr4(__read_cr4() | X86_CR4_PCIDE); + cr4_init(); #ifdef CONFIG_X86_32 /* switch away from the initial page table */ load_cr3(swapper_pg_dir); - /* - * Initialize the CR4 shadow before doing anything that could - * try to read it. - */ - cr4_init_shadow(); __flush_tlb_all(); #endif load_current_idt(); diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index e3b18ad49889afc5ae35d2e2796aecd108a93819..32a9c22121249672cc9256c7d368a0787b129a9b 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -57,6 +57,7 @@ static void cpu_bringup(void) { int cpu; + cr4_init(); cpu_init(); touch_softlockup_watchdog(); preempt_disable(); diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 2c9d7056023827260ccc77517d8b29dc6d36a9f2..97239cce3ffcb3b41b0f6cc675ad85653f81b214 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -67,6 +67,8 @@ # define MAP_UNINITIALIZED 0x0 /* Don't support this flag */ #endif +#define MAP_ALIGN 0x2000000 /* create an aligned mapping */ + /* * Flags for msync */ diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 0b96220d0efd4180e72982bc23d7e5ee713bd008..2e22a3f7466a86b75c542dd55e6b23d0b4e73c58 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -399,7 +399,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); - bip->bip_iter.bi_sector += bytes_done >> 9; + bip->bip_iter.bi_sector += bio_integrity_intervals(bi, bytes_done >> 9); bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); } EXPORT_SYMBOL(bio_integrity_advance); diff --git a/block/bio.c b/block/bio.c index d94243411ef308b69e085434588d0ef05fd79ca8..6457cbfa70cc9db1b52886b2ba67bbc71c06a87d 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1528,7 +1528,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, if (bytes > len) bytes = len; - page = alloc_page(q->bounce_gfp | gfp_mask); + page = alloc_page(q->bounce_gfp | __GFP_ZERO | gfp_mask); if (!page) goto cleanup; @@ -1706,9 +1706,13 @@ void generic_end_io_acct(struct request_queue *q, int req_op, const int sgrp = op_stat_group(req_op); int cpu = part_stat_lock(); - update_io_ticks(cpu, part, now); + if (precise_iostat) { + part_round_stats(q, cpu, part); + } else { + update_io_ticks(cpu, part, now); + part_stat_add(cpu, part, time_in_queue, duration); + } part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_stat_add(cpu, part, time_in_queue, duration); part_dec_in_flight(q, part, op_is_write(req_op)); part_stat_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 41d0b09e9a67317b9571cf1c01334a90fd3361e0..a5d80ab9117070f612f0e7962d21db05cbb47335 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -56,6 +56,20 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); +bool precise_iostat; +static int __init precise_iostat_setup(char *str) +{ + bool precise; + + if (!strtobool(str, &precise)) { + precise_iostat = precise; + pr_info("precise iostat %d\n", precise_iostat); + } + + return 1; +} +__setup("precise_iostat=", precise_iostat_setup); + /* * For the allocated request tables */ @@ -1700,8 +1714,13 @@ static void part_round_stats_single(struct request_queue *q, int cpu, struct hd_struct *part, unsigned long now, unsigned int inflight) { - if (inflight) + if (inflight) { + if 
(precise_iostat) { + __part_stat_add(cpu, part, time_in_queue, + inflight * (now - part->stamp)); + } __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); + } part->stamp = now; } @@ -2650,8 +2669,10 @@ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, * blk_insert_cloned_request - Helper for stacking drivers to submit a request * @q: the queue to submit the request * @rq: the request being queued + * @precise: true if io account with start and done will be balanced */ -blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) +blk_status_t __blk_insert_cloned_request(struct request_queue *q, + struct request *rq, bool precise) { unsigned long flags; int where = ELEVATOR_INSERT_BACK; @@ -2674,7 +2695,16 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - return blk_mq_request_issue_directly(rq); + ret = blk_mq_request_issue_directly(rq); + if (ret && precise) { + u64 now = 0; + + if (blk_mq_need_time_stamp(rq)) + now = ktime_get_ns(); + + blk_account_io_done(rq, now); + } + return ret; } spin_lock_irqsave(q->queue_lock, flags); @@ -2699,6 +2729,13 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * return BLK_STS_OK; } +EXPORT_SYMBOL_GPL(__blk_insert_cloned_request); + +blk_status_t blk_insert_cloned_request(struct request_queue *q, + struct request *rq) +{ + return __blk_insert_cloned_request(q, rq, false); +} EXPORT_SYMBOL_GPL(blk_insert_cloned_request); /** @@ -2771,10 +2808,15 @@ void blk_account_io_done(struct request *req, u64 now) cpu = part_stat_lock(); part = req->part; - update_io_ticks(cpu, part, jiffies); + if (!precise_iostat) { + update_io_ticks(cpu, part, jiffies); + part_stat_add(cpu, part, time_in_queue, + nsecs_to_jiffies64(now - req->start_time_ns)); + } else { + part_round_stats(req->q, cpu, part); + } part_stat_inc(cpu, part, ios[sgrp]); part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns); - part_stat_add(cpu, part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns)); part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); diff --git a/block/blk-merge.c b/block/blk-merge.c index 4c17c1031e34f59353a6d561654ef0d74b71744d..d2fabe1fdf3264d9cf8bb47888da7a30a6a8ccb6 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -669,6 +669,8 @@ static void blk_account_io_merge(struct request *req) cpu = part_stat_lock(); part = req->part; + if (precise_iostat) + part_round_stats(req->q, cpu, part); part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); diff --git a/block/blk-mq.c b/block/blk-mq.c index 0732bcc65f8899523287354e7798caa8b8f8f92c..9604d2b39745cb5b5637cd174641b4f78315a277 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2459,12 +2459,16 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, if (!hctx->fq) goto free_bitmap; - if (hctx->flags & BLK_MQ_F_BLOCKING) - init_srcu_struct(hctx->srcu); + if (hctx->flags & BLK_MQ_F_BLOCKING) { + if (init_srcu_struct(hctx->srcu) != 0) + goto free_flush_queue; + } blk_mq_hctx_kobj_init(hctx); return hctx; + free_flush_queue: + blk_free_flush_queue(hctx->fq); free_bitmap: sbitmap_free(&hctx->ctx_map); free_ctxs: diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 1e6d5631669b96142e2d2e9ad78281a9a0f7c0f5..95d28cf8ff0f7636c3ad8d5082cf0909a2d8e605 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2422,8 +2422,11 @@ void blk_throtl_drain(struct 
request_queue *q) struct blkcg_gq *blkg; struct cgroup_subsys_state *pos_css; struct bio *bio; + struct bio_list bio_list_on_stack; int rw; + bio_list_init(&bio_list_on_stack); + queue_lockdep_assert_held(q); rcu_read_lock(); @@ -2440,12 +2443,16 @@ void blk_throtl_drain(struct request_queue *q) tg_drain_bios(&td->service_queue); rcu_read_unlock(); - spin_unlock_irq(q->queue_lock); /* all bios now should be in td->service_queue, issue them */ for (rw = READ; rw <= WRITE; rw++) while ((bio = throtl_pop_queued(&td->service_queue.queued[rw], NULL))) + bio_list_add(&bio_list_on_stack, bio); + spin_unlock_irq(q->queue_lock); + + if (!bio_list_empty(&bio_list_on_stack)) + while ((bio = bio_list_pop(&bio_list_on_stack))) generic_make_request(bio); spin_lock_irq(q->queue_lock); diff --git a/block/blk.h b/block/blk.h index e496e26630f71280ee82e28c2c619a29a3fa5d39..dde2141a32ddecb398d44ad73ef2310621ddde3f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -405,6 +405,15 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) return current->io_context; } +/* + * Only need start/end time stamping if we have stats enabled, or using + * an IO scheduler. + */ +static inline bool blk_mq_need_time_stamp(struct request *rq) +{ + return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator; +} + /* * Internal throttling interface */ diff --git a/block/genhd.c b/block/genhd.c index 183612cbbd6b77001f7878614ff8c68bfe95e60d..fc24384e2c85fe5d41035268902a7e3a78281d9f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -24,6 +24,7 @@ #include "blk.h" +extern bool precise_iostat; static DEFINE_MUTEX(block_class_lock); struct kobject *block_depr; @@ -47,7 +48,7 @@ static void disk_release_events(struct gendisk *disk); void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { - if (q->mq_ops) + if (!precise_iostat && q->mq_ops) return; atomic_inc(&part->in_flight[rw]); @@ -57,7 +58,7 @@ void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { - if (q->mq_ops) + if (!precise_iostat && q->mq_ops) return; atomic_dec(&part->in_flight[rw]); @@ -68,7 +69,7 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) void part_in_flight(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) { - if (q->mq_ops) { + if (!precise_iostat && q->mq_ops) { blk_mq_in_flight(q, part, inflight); return; } @@ -85,7 +86,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part, void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) { - if (q->mq_ops) { + if (!precise_iostat && q->mq_ops) { blk_mq_in_flight_rw(q, part, inflight); return; } @@ -1352,6 +1353,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) struct hd_struct *hd; char buf[BDEVNAME_SIZE]; unsigned int inflight[2]; + int cpu; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) @@ -1363,6 +1365,12 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { + if (precise_iostat) { + cpu = part_stat_lock(); + part_round_stats(gp->queue, cpu, hd); + part_stat_unlock(); + } + part_in_flight(gp->queue, hd, inflight); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " diff --git a/block/partition-generic.c b/block/partition-generic.c index 
739c0cc5fd2221ea028caab6ab4d385faa993e07..c4ac7a8c77dc5dd5937794a21b9909f3908a5bad 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "partitions/check.h" @@ -121,6 +122,13 @@ ssize_t part_stat_show(struct device *dev, struct hd_struct *p = dev_to_part(dev); struct request_queue *q = part_to_disk(p)->queue; unsigned int inflight[2]; + int cpu; + + if (precise_iostat) { + cpu = part_stat_lock(); + part_round_stats(q, cpu, p); + part_stat_unlock(); + } part_in_flight(q, p, inflight); return sprintf(buf, diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c index 00c7230acfed1de5d0d0330b49af9311747ab642..4bf4bb5b1a8bcf15c5a09df754fa2856942bda7a 100644 --- a/crypto/pcrypt.c +++ b/crypto/pcrypt.c @@ -509,11 +509,12 @@ static int __init pcrypt_init(void) static void __exit pcrypt_exit(void) { + crypto_unregister_template(&pcrypt_tmpl); + pcrypt_fini_padata(&pencrypt); pcrypt_fini_padata(&pdecrypt); kset_unregister(pcrypt_kset); - crypto_unregister_template(&pcrypt_tmpl); } module_init(pcrypt_init); diff --git a/drivers/acpi/acpica/exoparg1.c b/drivers/acpi/acpica/exoparg1.c index ba9fbae0cf91fe87fb454ff53bfcc25885ed7503..319f4bc6a83947b93f2fc13b54f781ee5b80ea8e 100644 --- a/drivers/acpi/acpica/exoparg1.c +++ b/drivers/acpi/acpica/exoparg1.c @@ -1007,7 +1007,8 @@ acpi_status acpi_ex_opcode_1A_0T_1R(struct acpi_walk_state *walk_state) (walk_state, return_desc, &temp_desc); if (ACPI_FAILURE(status)) { - goto cleanup; + return_ACPI_STATUS + (status); } return_desc = temp_desc; diff --git a/drivers/acpi/acpica/utdelete.c b/drivers/acpi/acpica/utdelete.c index 8cc4392c61f33aea636251426e263b7017613498..42e489cfa4d5bd8f9232c697a60c90df405527d9 100644 --- a/drivers/acpi/acpica/utdelete.c +++ b/drivers/acpi/acpica/utdelete.c @@ -410,6 +410,7 @@ acpi_ut_update_ref_count(union acpi_operand_object *object, u32 action) ACPI_WARNING((AE_INFO, "Obj %p, Reference Count is already zero, cannot decrement\n", object)); + return; } ACPI_DEBUG_PRINT_RAW((ACPI_DB_ALLOCATIONS, diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index bd35cfe6776d0e3ca2c8b077199bf2df4feca178..320db806994b27bd78ed2fb1a8155a1f2568a8d9 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -997,6 +997,11 @@ static int nc_dma_get_range(struct device *dev, u64 *size) ncomp = (struct acpi_iort_named_component *)node->node_data; + if (!ncomp->memory_address_limit) { + pr_warn(FW_BUG "Named component missing memory address limit\n"); + return -EINVAL; + } + *size = ncomp->memory_address_limit >= 64 ? U64_MAX : 1ULL<memory_address_limit; @@ -1016,6 +1021,11 @@ static int rc_dma_get_range(struct device *dev, u64 *size) rc = (struct acpi_iort_root_complex *)node->node_data; + if (!rc->memory_address_limit) { + pr_warn(FW_BUG "Root complex missing memory address limit\n"); + return -EINVAL; + } + *size = rc->memory_address_limit >= 64 ? U64_MAX : 1ULL<memory_address_limit; @@ -1072,8 +1082,8 @@ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) * retrieved from firmware. 
*/ dev->bus_dma_mask = mask; - dev->coherent_dma_mask = mask; - *dev->dma_mask = mask; + dev->coherent_dma_mask = min(dev->coherent_dma_mask, mask); + *dev->dma_mask = min(*dev->dma_mask, mask); } *dma_addr = dmaaddr; diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 18c32637047f81d0569f79e8fca736cfee13e169..0654133efbe4be31e7264f2da4b775d18194a98d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -483,6 +483,9 @@ enum binder_deferred_state { * @files files_struct for process * (protected by @files_lock) * @files_lock mutex to protect @files + * @cred struct cred associated with the `struct file` + * in binder_open() + * (invariant after initialized) * @deferred_work_node: element for binder_deferred_list * (protected by binder_deferred_lock) * @deferred_work: bitmap of deferred work to perform @@ -529,6 +532,7 @@ struct binder_proc { struct task_struct *tsk; struct files_struct *files; struct mutex files_lock; + const struct cred *cred; struct hlist_node deferred_work_node; int deferred_work; bool is_dead; @@ -2333,7 +2337,7 @@ static int binder_translate_binder(struct flat_binder_object *fp, ret = -EINVAL; goto done; } - if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) { + if (security_binder_transfer_binder(proc->cred, target_proc->cred)) { ret = -EPERM; goto done; } @@ -2379,7 +2383,7 @@ static int binder_translate_handle(struct flat_binder_object *fp, proc->pid, thread->pid, fp->handle); return -EINVAL; } - if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) { + if (security_binder_transfer_binder(proc->cred, target_proc->cred)) { ret = -EPERM; goto done; } @@ -2463,7 +2467,7 @@ static int binder_translate_fd(int fd, ret = -EBADF; goto err_fget; } - ret = security_binder_transfer_file(proc->tsk, target_proc->tsk, file); + ret = security_binder_transfer_file(proc->cred, target_proc->cred, file); if (ret < 0) { ret = -EPERM; goto err_security; @@ -2841,8 +2845,8 @@ static void binder_transaction(struct binder_proc *proc, goto err_dead_binder; } e->to_node = target_node->debug_id; - if (security_binder_transaction(proc->tsk, - target_proc->tsk) < 0) { + if (security_binder_transaction(proc->cred, + target_proc->cred) < 0) { return_error = BR_FAILED_REPLY; return_error_param = -EPERM; return_error_line = __LINE__; @@ -4328,6 +4332,7 @@ static void binder_free_proc(struct binder_proc *proc) BUG_ON(!list_empty(&proc->delivered_death)); binder_alloc_deferred_release(&proc->alloc); put_task_struct(proc->tsk); + put_cred(proc->cred); binder_stats_deleted(BINDER_STAT_PROC); kfree(proc); } @@ -4531,7 +4536,7 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) ret = -EBUSY; goto out; } - ret = security_binder_set_context_mgr(proc->tsk); + ret = security_binder_set_context_mgr(proc->cred); if (ret < 0) goto out; if (uid_valid(context->binder_context_mgr_uid)) { @@ -4786,6 +4791,7 @@ static int binder_open(struct inode *nodp, struct file *filp) get_task_struct(current->group_leader); proc->tsk = current->group_leader; mutex_init(&proc->files_lock); + proc->cred = get_cred(filp->f_cred); INIT_LIST_HEAD(&proc->todo); proc->default_priority = task_nice(current); binder_dev = container_of(filp->private_data, struct binder_device, diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a35189098581b3705ec3fca6003e527ad98504f8..c2e5ba599418d03789371205d2b53e163835262a 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1704,14 +1704,6 @@ static int nbd_dev_add(int index) int err = -ENOMEM; int first_minor = 
index << part_shift; - /* - * Too big index can cause duplicate creation of sysfs files/links, - * because MKDEV() expect that the max first minor is MINORMASK, or - * index << part_shift can overflow. - */ - if (first_minor < index || first_minor > MINORMASK) - return -EINVAL; - nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL); if (!nbd) goto out; @@ -1844,8 +1836,20 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; - if (info->attrs[NBD_ATTR_INDEX]) + if (info->attrs[NBD_ATTR_INDEX]) { index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); + + /* + * Too big first_minor can cause duplicate creation of + * sysfs files/links, since index << part_shift might + * overflow, or MKDEV() expect that the max bits of + * first_minor is 20. + */ + if (index < 0 || index > MINORMASK >> part_shift) { + printk(KERN_ERR "nbd: illegal input index %d\n", index); + return -EINVAL; + } + } if (!info->attrs[NBD_ATTR_SOCKETS]) { printk(KERN_ERR "nbd: must specify at least one socket\n"); return -EINVAL; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 4e52917ebd3115bb320cb599104b8a97b6fdaf3f..0e451b17f33a2378fd1b1d39c82be3061b682934 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1342,7 +1342,8 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) rinfo->ring_ref[i] = GRANT_INVALID_REF; } } - free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * XEN_PAGE_SIZE)); + free_pages_exact(rinfo->ring.sring, + info->nr_ring_pages * XEN_PAGE_SIZE); rinfo->ring.sring = NULL; if (rinfo->irq) @@ -1426,9 +1427,15 @@ static int blkif_get_final_status(enum blk_req_status s1, return BLKIF_RSP_OKAY; } -static bool blkif_completion(unsigned long *id, - struct blkfront_ring_info *rinfo, - struct blkif_response *bret) +/* + * Return values: + * 1 response processed. + * 0 missing further responses. + * -1 error while processing. + */ +static int blkif_completion(unsigned long *id, + struct blkfront_ring_info *rinfo, + struct blkif_response *bret) { int i = 0; struct scatterlist *sg; @@ -1451,7 +1458,7 @@ static bool blkif_completion(unsigned long *id, /* Wait the second response if not yet here. */ if (s2->status < REQ_DONE) - return false; + return 0; bret->status = blkif_get_final_status(s->status, s2->status); @@ -1502,42 +1509,43 @@ static bool blkif_completion(unsigned long *id, } /* Add the persistent grant into the list of free grants */ for (i = 0; i < num_grant; i++) { - if (gnttab_query_foreign_access(s->grants_used[i]->gref)) { + if (!gnttab_try_end_foreign_access(s->grants_used[i]->gref)) { /* * If the grant is still mapped by the backend (the * backend has chosen to make this grant persistent) * we add it at the head of the list, so it will be * reused first. */ - if (!info->feature_persistent) - pr_alert_ratelimited("backed has not unmapped grant: %u\n", - s->grants_used[i]->gref); + if (!info->feature_persistent) { + pr_alert("backed has not unmapped grant: %u\n", + s->grants_used[i]->gref); + return -1; + } list_add(&s->grants_used[i]->node, &rinfo->grants); rinfo->persistent_gnts_c++; } else { /* - * If the grant is not mapped by the backend we end the - * foreign access and add it to the tail of the list, - * so it will not be picked again unless we run out of - * persistent grants. 
+ * If the grant is not mapped by the backend we add it + * to the tail of the list, so it will not be picked + * again unless we run out of persistent grants. */ - gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); s->grants_used[i]->gref = GRANT_INVALID_REF; list_add_tail(&s->grants_used[i]->node, &rinfo->grants); } } if (s->req.operation == BLKIF_OP_INDIRECT) { for (i = 0; i < INDIRECT_GREFS(num_grant); i++) { - if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) { - if (!info->feature_persistent) - pr_alert_ratelimited("backed has not unmapped grant: %u\n", - s->indirect_grants[i]->gref); + if (!gnttab_try_end_foreign_access(s->indirect_grants[i]->gref)) { + if (!info->feature_persistent) { + pr_alert("backed has not unmapped grant: %u\n", + s->indirect_grants[i]->gref); + return -1; + } list_add(&s->indirect_grants[i]->node, &rinfo->grants); rinfo->persistent_gnts_c++; } else { struct page *indirect_page; - gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL); /* * Add the used indirect page back to the list of * available pages for indirect grefs. @@ -1552,7 +1560,7 @@ static bool blkif_completion(unsigned long *id, } } - return true; + return 1; } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -1618,12 +1626,17 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) } if (bret.operation != BLKIF_OP_DISCARD) { + int ret; + /* * We may need to wait for an extra response if the * I/O request is split in 2 */ - if (!blkif_completion(&id, rinfo, &bret)) + ret = blkif_completion(&id, rinfo, &bret); + if (!ret) continue; + if (unlikely(ret < 0)) + goto err; } if (add_id_to_freelist(rinfo, id)) { @@ -1729,8 +1742,7 @@ static int setup_blkring(struct xenbus_device *dev, for (i = 0; i < info->nr_ring_pages; i++) rinfo->ring_ref[i] = GRANT_INVALID_REF; - sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, - get_order(ring_size)); + sring = alloc_pages_exact(ring_size, GFP_NOIO); if (!sring) { xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); return -ENOMEM; @@ -1740,7 +1752,7 @@ static int setup_blkring(struct xenbus_device *dev, err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref); if (err < 0) { - free_pages((unsigned long)sring, get_order(ring_size)); + free_pages_exact(sring, ring_size); rinfo->ring.sring = NULL; goto fail; } @@ -2719,11 +2731,10 @@ static void purge_persistent_grants(struct blkfront_info *info) list_for_each_entry_safe(gnt_list_entry, tmp, &rinfo->grants, node) { if (gnt_list_entry->gref == GRANT_INVALID_REF || - gnttab_query_foreign_access(gnt_list_entry->gref)) + !gnttab_try_end_foreign_access(gnt_list_entry->gref)) continue; list_del(&gnt_list_entry->node); - gnttab_end_foreign_access(gnt_list_entry->gref, 0, 0UL); rinfo->persistent_gnts_c--; gnt_list_entry->gref = GRANT_INVALID_REF; list_add_tail(&gnt_list_entry->node, &rinfo->grants); diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 505562acf275527857496637cf470b3759d3b0f4..d6bb0a4bde56fbd1f2eabacb96221ee5b6da6ea2 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -553,9 +553,9 @@ config ADI driver include crash and makedumpfile. config HISI_SVM - bool "Hisilicon svm driver" + tristate "Hisilicon svm driver" depends on ARM64 && ARM_SMMU_V3 && MMU_NOTIFIER && HUGETLBFS - default y + default m help This driver provides character-level access to Hisilicon SVM chipset. 
Typically, you can bind a task to the diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 272c34102875caf4a80b6cf8f0282ba33abc4a97..83eaab66d2f64f65d71adc4f5d9fa2bccb28a9cd 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -2863,7 +2863,7 @@ cleanup_bmc_device(struct kref *ref) * with removing the device attributes while reading a device * attribute. */ - schedule_work(&bmc->remove_work); + queue_work(remove_work_wq, &bmc->remove_work); } /* @@ -5083,22 +5083,27 @@ static int ipmi_init_msghandler(void) if (initialized) goto out; - init_srcu_struct(&ipmi_interfaces_srcu); - - timer_setup(&ipmi_timer, ipmi_timeout, 0); - mod_timer(&ipmi_timer, jiffies + IPMI_TIMEOUT_JIFFIES); - - atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + rv = init_srcu_struct(&ipmi_interfaces_srcu); + if (rv) + goto out; remove_work_wq = create_singlethread_workqueue("ipmi-msghandler-remove-wq"); if (!remove_work_wq) { pr_err("unable to create ipmi-msghandler-remove-wq workqueue"); rv = -ENOMEM; - goto out; + goto out_wq; } + timer_setup(&ipmi_timer, ipmi_timeout, 0); + mod_timer(&ipmi_timer, jiffies + IPMI_TIMEOUT_JIFFIES); + + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + initialized = true; +out_wq: + if (rv) + cleanup_srcu_struct(&ipmi_interfaces_srcu); out: mutex_unlock(&ipmi_interfaces_mutex); return rv; diff --git a/drivers/char/svm.c b/drivers/char/svm.c index 098b84529d0e58ae33198564b3ca77ea4f096e54..9f98f57d562abfb649f9bdc2ed2d732754ad4cbe 100644 --- a/drivers/char/svm.c +++ b/drivers/char/svm.c @@ -43,8 +43,6 @@ #define SVM_IOCTL_LOAD_FLAG 0xfffa #define SVM_IOCTL_PIN_MEMORY 0xfff7 #define SVM_IOCTL_UNPIN_MEMORY 0xfff5 -#define SVM_IOCTL_GETHUGEINFO 0xfff6 -#define SVM_IOCTL_GET_PHYMEMINFO 0xfff8 #define SVM_IOCTL_REMAP_PROC 0xfff4 #define SVM_REMAP_MEM_LEN_MAX (16 * 1024 * 1024) @@ -123,23 +121,6 @@ struct svm_proc_mem { u64 buf; }; -struct meminfo { - unsigned long hugetlbfree; - unsigned long hugetlbtotal; -}; - -struct phymeminfo { - unsigned long normal_total; - unsigned long normal_free; - unsigned long huge_total; - unsigned long huge_free; -}; - -struct phymeminfo_ioctl { - struct phymeminfo *info; - unsigned long nodemask; -}; - struct spalloc { unsigned long addr; unsigned long size; @@ -169,10 +150,6 @@ static char *svm_cmd_to_string(unsigned int cmd) return "pin memory"; case SVM_IOCTL_UNPIN_MEMORY: return "unpin memory"; - case SVM_IOCTL_GETHUGEINFO: - return "get hugeinfo"; - case SVM_IOCTL_GET_PHYMEMINFO: - return "get physical memory info"; case SVM_IOCTL_REMAP_PROC: return "remap proc"; case SVM_IOCTL_LOAD_FLAG: @@ -186,8 +163,6 @@ static char *svm_cmd_to_string(unsigned int cmd) return NULL; } -extern void sysrq_sched_debug_tidy(void); - /* * image word of slot * SVM_IMAGE_WORD_INIT: initial value, indicating that the slot is not used. 
@@ -446,17 +421,6 @@ static int svm_va2pa_trunk_init(struct device *dev) return 0; } -void sysrq_sched_debug_show_export(void) -{ -#ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_tidy(); -#else - pr_err("Not open CONFIG_SCHED_DEBUG\n"); -#endif - panic("pcie heart miss\n"); -} -EXPORT_SYMBOL(sysrq_sched_debug_show_export); - static struct svm_process *find_svm_process(unsigned long asid) { struct rb_node *node = svm_process_root.rb_node; @@ -536,7 +500,7 @@ static int svm_remove_core(struct device *dev, void *data) { struct core_device *cdev = to_core_device(dev); - if (!cdev->smmu_bypass) { + if (!cdev->smmu_bypass && cdev->group && cdev->domain) { iommu_sva_device_shutdown(dev); iommu_detach_group(cdev->domain, cdev->group); iommu_group_put(cdev->group); @@ -964,11 +928,7 @@ static struct task_struct *svm_get_task(struct svm_bind_process params) if (params.flags & SVM_BIND_PID) { struct mm_struct *mm = NULL; - rcu_read_lock(); - task = find_task_by_vpid(params.vpid); - if (task) - get_task_struct(task); - rcu_read_unlock(); + task = find_get_task_by_vpid(params.vpid); if (task == NULL) return ERR_PTR(-ESRCH); @@ -1075,6 +1035,7 @@ static int svm_acpi_add_core(struct svm_device *sdev, struct core_device *cdev = NULL; char *name = NULL; enum dev_dma_attr attr; + const union acpi_object *obj; name = devm_kasprintf(sdev->dev, GFP_KERNEL, "svm_child_dev%d", id); if (name == NULL) @@ -1099,7 +1060,7 @@ static int svm_acpi_add_core(struct svm_device *sdev, return err; } - attr = acpi_get_dma_attr(children); + attr = device_get_dma_attr(&cdev->dev); if (attr != DEV_DMA_NOT_SUPPORTED) { err = acpi_dma_configure(&cdev->dev, attr); if (err) { @@ -1108,12 +1069,14 @@ static int svm_acpi_add_core(struct svm_device *sdev, } } - err = acpi_dev_prop_read_single(children, "hisi,smmu-bypass", - DEV_PROP_U8, &cdev->smmu_bypass); + err = acpi_dev_get_property(children, "hisi,smmu-bypass", + DEV_PROP_U8, &obj); if (err) { dev_info(&children->dev, "read smmu bypass failed\n"); } + cdev->smmu_bypass = (u8)obj->integer.value; + cdev->group = iommu_group_get(&cdev->dev); if (IS_ERR_OR_NULL(cdev->group)) { dev_err(&cdev->dev, "smmu is not right configured\n"); @@ -1332,7 +1295,7 @@ static pte_t *svm_get_pte(struct vm_area_struct *vma, if (is_vm_hugetlb_page(vma)) { if (pud_present(*pud)) { - if (pud_huge(*pud)) { + if (pud_val(*pud) && !(pud_val(*pud) & PUD_TABLE_BIT)) { pte = (pte_t *)pud; *offset = addr & (PUD_SIZE - 1); size = PUD_SIZE; @@ -1383,11 +1346,11 @@ static pte_t *svm_walk_pt(unsigned long addr, unsigned long *page_size, return NULL; pgd = pgd_offset(mm, addr); - if (pgd_none_or_clear_bad(pgd)) + if (pgd_none(*pgd)) return NULL; pud = pud_offset(pgd, addr); - if (pud_none_or_clear_bad(pud)) + if (pud_none(*pud)) return NULL; return svm_get_pte(vma, pud, addr, page_size, offset); @@ -1562,95 +1525,6 @@ static int svm_set_rc(unsigned long __user *arg) return 0; } -static long svm_get_hugeinfo(unsigned long __user *arg) -{ - struct hstate *h = &default_hstate; - struct meminfo info; - - if (!acpi_disabled) - return -EPERM; - - if (arg == NULL) - return -EINVAL; - - if (!hugepages_supported()) - return -ENOTSUPP; - - info.hugetlbfree = h->free_huge_pages; - info.hugetlbtotal = h->nr_huge_pages; - - if (copy_to_user((void __user *)arg, &info, sizeof(info))) - return -EFAULT; - - pr_info("svm get hugetlb info: order(%u), max_huge_pages(%lu)," - "nr_huge_pages(%lu), free_huge_pages(%lu), resv_huge_pages(%lu)", - h->order, - h->max_huge_pages, - h->nr_huge_pages, - h->free_huge_pages, - h->resv_huge_pages); - 
- return 0; -} - -static void svm_get_node_memory_info_inc(unsigned long nid, struct phymeminfo *info) -{ - struct sysinfo i; - struct hstate *h = &default_hstate; - unsigned long huge_free = 0; - unsigned long huge_total = 0; - - if (hugepages_supported()) { - huge_free = h->free_huge_pages_node[nid] * (PAGE_SIZE << huge_page_order(h)); - huge_total = h->nr_huge_pages_node[nid] * (PAGE_SIZE << huge_page_order(h)); - } - -#ifdef CONFIG_NUMA - si_meminfo_node(&i, nid); -#else - si_meminfo(&i); -#endif - info->normal_free += i.freeram * PAGE_SIZE; - info->normal_total += i.totalram * PAGE_SIZE - huge_total; - info->huge_total += huge_total; - info->huge_free += huge_free; -} - -static void __svm_get_memory_info(unsigned long nodemask, struct phymeminfo *info) -{ - memset(info, 0x0, sizeof(struct phymeminfo)); - - nodemask = nodemask & ((1UL << MAX_NUMNODES) - 1); - - while (nodemask) { - unsigned long nid = find_first_bit(&nodemask, BITS_PER_LONG); - if (node_isset(nid, node_online_map)) { - (void)svm_get_node_memory_info_inc(nid, info); - } - - nodemask &= ~(1UL << nid); - } -} - -static long svm_get_phy_memory_info(unsigned long __user *arg) -{ - struct phymeminfo info; - struct phymeminfo_ioctl para; - - if (arg == NULL) - return -EINVAL; - - if (copy_from_user(¶, (void __user *)arg, sizeof(para))) - return -EFAULT; - - __svm_get_memory_info(para.nodemask, &info); - - if (copy_to_user((void __user *)para.info, &info, sizeof(info))) - return -EFAULT; - - return 0; -} - static long svm_remap_get_phys(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long *phys, unsigned long *page_size, unsigned long *offset) @@ -1665,11 +1539,11 @@ static long svm_remap_get_phys(struct mm_struct *mm, struct vm_area_struct *vma, return err; pgd = pgd_offset(mm, addr); - if (pgd_none_or_clear_bad(pgd)) + if (pgd_none(*pgd)) return err; pud = pud_offset(pgd, addr); - if (pud_none_or_clear_bad(pud)) + if (pud_none(*pud)) return err; pte = svm_get_pte(vma, pud, addr, page_size, offset); @@ -1710,20 +1584,19 @@ static long svm_remap_proc(unsigned long __user *arg) return -EINVAL; } - rcu_read_lock(); if (pmem.pid) { - ptask = find_task_by_vpid(pmem.pid); + ptask = find_get_task_by_vpid(pmem.pid); if (!ptask) { - rcu_read_unlock(); pr_err("No task for this pid\n"); return -EINVAL; } } else { + rcu_read_lock(); ptask = current; + get_task_struct(ptask); + rcu_read_unlock(); } - get_task_struct(ptask); - rcu_read_unlock(); pmm = ptask->mm; down_read(&mm->mmap_sem); @@ -1810,65 +1683,6 @@ static int svm_proc_load_flag(int __user *arg) return put_user(flag, arg); } -static unsigned long svm_get_unmapped_area(struct file *file, - unsigned long addr0, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - unsigned long addr = addr0; - struct mm_struct *mm = current->mm; - struct vm_unmapped_area_info info; - struct svm_device *sdev = file_to_sdev(file); - - if (!acpi_disabled) - return -EPERM; - - if (flags & MAP_FIXED) { - if (IS_ALIGNED(addr, len)) - return addr; - - dev_err(sdev->dev, "MAP_FIXED but not aligned\n"); - return -EINVAL; //lint !e570 - } - - if (addr) { - struct vm_area_struct *vma = NULL; - - addr = ALIGN(addr, len); - - if (dvpp_mmap_check(addr, len, flags)) - return -ENOMEM; - - vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (vma == NULL || addr + len <= vm_start_gap(vma))) - return addr; - } - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - 
info.high_limit = ((mm->mmap_base <= DVPP_MMAP_BASE) ? - mm->mmap_base : DVPP_MMAP_BASE); - info.align_mask = ((len >> PAGE_SHIFT) - 1) << PAGE_SHIFT; - info.align_offset = pgoff << PAGE_SHIFT; - - addr = vm_unmapped_area(&info); - - if (offset_in_page(addr)) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = DVPP_MMAP_BASE; - - if (enable_mmap_dvpp) - dvpp_mmap_get_area(&info, flags); - - addr = vm_unmapped_area(&info); - } - - return addr; -} - static int svm_mmap(struct file *file, struct vm_area_struct *vma) { int err; @@ -2092,12 +1906,6 @@ static long svm_ioctl(struct file *file, unsigned int cmd, case SVM_IOCTL_UNPIN_MEMORY: err = svm_unpin_memory((unsigned long __user *)arg); break; - case SVM_IOCTL_GETHUGEINFO: - err = svm_get_hugeinfo((unsigned long __user *)arg); - break; - case SVM_IOCTL_GET_PHYMEMINFO: - err = svm_get_phy_memory_info((unsigned long __user *)arg); - break; case SVM_IOCTL_REMAP_PROC: err = svm_remap_proc((unsigned long __user *)arg); break; @@ -2128,7 +1936,6 @@ static const struct file_operations svm_fops = { .owner = THIS_MODULE, .open = svm_open, .mmap = svm_mmap, - .get_unmapped_area = svm_get_unmapped_area, .unlocked_ioctl = svm_ioctl, }; diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index 0aade05f47fa6cf2c6e17ddd259f181b24c5f1ac..a8e8289e0b358dfdda8c15d3a3aa6c69834d3a2d 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -918,7 +918,15 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, intmask |= TPM_INTF_CMD_READY_INT | TPM_INTF_LOCALITY_CHANGE_INT | TPM_INTF_DATA_AVAIL_INT | TPM_INTF_STS_VALID_INT; intmask &= ~TPM_GLOBAL_INT_ENABLE; + + rc = request_locality(chip, 0); + if (rc < 0) { + rc = -ENODEV; + goto out_err; + } + tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality), intmask); + release_locality(chip, 0); rc = tpm2_probe(chip); if (rc) diff --git a/drivers/crypto/qce/sha.c b/drivers/crypto/qce/sha.c index d8a5db11b7ea1f3b3fec471b2cca1e11a84d6d1e..bffd4d15145d95841c006535c216a24e5383cb6e 100644 --- a/drivers/crypto/qce/sha.c +++ b/drivers/crypto/qce/sha.c @@ -521,8 +521,8 @@ static int qce_ahash_register_one(const struct qce_ahash_def *def, ret = crypto_register_ahash(alg); if (ret) { - kfree(tmpl); dev_err(qce->dev, "%s registration failed\n", base->cra_name); + kfree(tmpl); return ret; } diff --git a/drivers/crypto/stm32/stm32-cryp.c b/drivers/crypto/stm32/stm32-cryp.c index 23b0b7bd64c7f8388e0e641f86cd7ecbdd95b40e..b3b49dce1136955e84af78c824f576a28452ef89 100644 --- a/drivers/crypto/stm32/stm32-cryp.c +++ b/drivers/crypto/stm32/stm32-cryp.c @@ -2036,8 +2036,6 @@ static int stm32_cryp_probe(struct platform_device *pdev) list_del(&cryp->list); spin_unlock(&cryp_list.lock); - pm_runtime_disable(dev); - pm_runtime_put_noidle(dev); pm_runtime_disable(dev); pm_runtime_put_noidle(dev); diff --git a/drivers/crypto/stm32/stm32_crc32.c b/drivers/crypto/stm32/stm32_crc32.c index 29d2095d9dfda8eccb40eafb097fdaf7baeb7b35..48c4a71d1cb328ba00538bef357cd53317f216b3 100644 --- a/drivers/crypto/stm32/stm32_crc32.c +++ b/drivers/crypto/stm32/stm32_crc32.c @@ -217,7 +217,7 @@ static struct shash_alg algs[] = { .digestsize = CHKSUM_DIGEST_SIZE, .base = { .cra_name = "crc32", - .cra_driver_name = DRIVER_NAME, + .cra_driver_name = "stm32-crc32-crc32", .cra_priority = 200, .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = CHKSUM_BLOCK_SIZE, @@ -239,7 +239,7 @@ static struct shash_alg algs[] = { .digestsize = 
CHKSUM_DIGEST_SIZE, .base = { .cra_name = "crc32c", - .cra_driver_name = DRIVER_NAME, + .cra_driver_name = "stm32-crc32-crc32c", .cra_priority = 200, .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = CHKSUM_BLOCK_SIZE, diff --git a/drivers/dma/mmp_pdma.c b/drivers/dma/mmp_pdma.c index eb3a1f42ab065793fbd4c30197b7cbc2300ec7f7..e8b2d3e31de802b4e5d321383d01994298e76012 100644 --- a/drivers/dma/mmp_pdma.c +++ b/drivers/dma/mmp_pdma.c @@ -722,12 +722,6 @@ static int mmp_pdma_config(struct dma_chan *dchan, chan->dir = cfg->direction; chan->dev_addr = addr; - /* FIXME: drivers should be ported over to use the filter - * function. Once that's done, the following two lines can - * be removed. - */ - if (cfg->slave_id) - chan->drcmr = cfg->slave_id; return 0; } diff --git a/drivers/dma/pxa_dma.c b/drivers/dma/pxa_dma.c index b31c28b67ad3ec6aef3babf5402defaac504f680..c54986902b9d28f8378ccc65fe6b01ee5937e6ae 100644 --- a/drivers/dma/pxa_dma.c +++ b/drivers/dma/pxa_dma.c @@ -960,13 +960,6 @@ static void pxad_get_config(struct pxad_chan *chan, *dcmd |= PXA_DCMD_BURST16; else if (maxburst == 32) *dcmd |= PXA_DCMD_BURST32; - - /* FIXME: drivers should be ported over to use the filter - * function. Once that's done, the following two lines can - * be removed. - */ - if (chan->cfg.slave_id) - chan->drcmr = chan->cfg.slave_id; } static struct dma_async_tx_descriptor * diff --git a/drivers/dma/sh/rcar-dmac.c b/drivers/dma/sh/rcar-dmac.c index 80ff95f75199f0ce30f733ebba6a727a7921a31b..29c51762336d7ed31acaa3c20fcc61de8340a863 100644 --- a/drivers/dma/sh/rcar-dmac.c +++ b/drivers/dma/sh/rcar-dmac.c @@ -1817,7 +1817,9 @@ static int rcar_dmac_probe(struct platform_device *pdev) platform_set_drvdata(pdev, dmac); dmac->dev->dma_parms = &dmac->parms; dma_set_max_seg_size(dmac->dev, RCAR_DMATCR_MASK); - dma_set_mask_and_coherent(dmac->dev, DMA_BIT_MASK(40)); + ret = dma_set_mask_and_coherent(dmac->dev, DMA_BIT_MASK(40)); + if (ret) + return ret; ret = rcar_dmac_parse_of(&pdev->dev, dmac); if (ret < 0) diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index 4709f08f39e490d2b755b4a6b5bf5dc072635734..02f5188379784ad32e0a348bccbc074e9ab8d29e 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -189,9 +189,10 @@ static struct drm_gem_object *vgem_gem_create(struct drm_device *dev, return ERR_CAST(obj); ret = drm_gem_handle_create(file, &obj->base, handle); - drm_gem_object_put_unlocked(&obj->base); - if (ret) + if (ret) { + drm_gem_object_put_unlocked(&obj->base); return ERR_PTR(ret); + } return &obj->base; } @@ -214,7 +215,9 @@ static int vgem_gem_dumb_create(struct drm_file *file, struct drm_device *dev, args->size = gem_object->size; args->pitch = pitch; - DRM_DEBUG_DRIVER("Created object of size %lld\n", size); + drm_gem_object_put_unlocked(gem_object); + + DRM_DEBUG_DRIVER("Created object of size %llu\n", args->size); return 0; } diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 1e9a5da562f0d7dd28b2623ca3ec2e6e3900811c..29bf91e50d4131f562990d6ce21c222cbc8f578a 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -769,6 +770,7 @@ static int iommu_ga_log_enable(struct amd_iommu *iommu) status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); if (status & (MMIO_STATUS_GALOG_RUN_MASK)) break; + udelay(10); } if (i >= LOOP_TIMEOUT) diff --git a/drivers/iommu/intel_irq_remapping.c 
b/drivers/iommu/intel_irq_remapping.c index 967450bd421a171c97adec77faf50c92d46a5c67..27600545191428895eb469b5a7d5d8902473a4b2 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -539,7 +539,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) irq_domain_free_fwnode(fn); if (!iommu->ir_domain) { pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id); - goto out_free_bitmap; + goto out_free_fwnode; } iommu->ir_msi_domain = arch_create_remap_msi_irq_domain(iommu->ir_domain, @@ -563,7 +563,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) if (dmar_enable_qi(iommu)) { pr_err("Failed to enable queued invalidation\n"); - goto out_free_bitmap; + goto out_free_ir_domain; } } @@ -587,6 +587,14 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) return 0; +out_free_ir_domain: + if (iommu->ir_msi_domain) + irq_domain_remove(iommu->ir_msi_domain); + iommu->ir_msi_domain = NULL; + irq_domain_remove(iommu->ir_domain); + iommu->ir_domain = NULL; +out_free_fwnode: + irq_domain_free_fwnode(fn); out_free_bitmap: kfree(bitmap); out_free_pages: diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index efd6e994678f9974068c53af12677e71d6368f67..3f99077b3611a7567d66e6ed42f3ad5ef8821d7e 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -299,11 +299,12 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data, static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table, arm_lpae_iopte *ptep, arm_lpae_iopte curr, - struct io_pgtable_cfg *cfg) + struct arm_lpae_io_pgtable *data) { arm_lpae_iopte old, new; + struct io_pgtable_cfg *cfg = &data->iop.cfg; - new = __pa(table) | ARM_LPAE_PTE_TYPE_TABLE; + new = paddr_to_iopte(__pa(table), data) | ARM_LPAE_PTE_TYPE_TABLE; if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS) new |= ARM_LPAE_PTE_NSTABLE; @@ -354,7 +355,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, if (!cptep) return -ENOMEM; - pte = arm_lpae_install_table(cptep, ptep, 0, cfg); + pte = arm_lpae_install_table(cptep, ptep, 0, data); if (pte) __arm_lpae_free_pages(cptep, tblsz, cfg); } else if (!cfg->coherent_walk && !(pte & ARM_LPAE_PTE_SW_SYNC)) { @@ -513,7 +514,7 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, __arm_lpae_init_pte(data, blk_paddr, pte, lvl, &tablep[i]); } - pte = arm_lpae_install_table(tablep, ptep, blk_pte, cfg); + pte = arm_lpae_install_table(tablep, ptep, blk_pte, data); if (pte != blk_pte) { __arm_lpae_free_pages(tablep, tablesz, cfg); /* diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 63bd5940b2901bde776af6dc33998424c99587f3..84a59762b037ab30b383b8ec48de9fa17d4d75e1 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2287,6 +2287,7 @@ int iommu_request_dm_for_dev(struct device *dev) return ret; } +EXPORT_SYMBOL_GPL(iommu_request_dm_for_dev); const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) { diff --git a/drivers/irqchip/irq-gic-phytium-2500-its.c b/drivers/irqchip/irq-gic-phytium-2500-its.c index fff1c8546d230d9ad44e4e1fd10d60703fca056d..dd24af3793ca5f0a744f5559f28c0a00aa863468 100644 --- a/drivers/irqchip/irq-gic-phytium-2500-its.c +++ b/drivers/irqchip/irq-gic-phytium-2500-its.c @@ -1181,8 +1181,7 @@ static int its_cpumask_select(struct its_device *its_dev, } cpu = cpumask_any_and(mask_val, cpu_mask); - if ((cpu > cpus) && (cpu < (cpus + skt_cpu_cnt[skt_id]))) - cpus = cpu; + cpus = cpus + cpu % 
skt_cpu_cnt[skt_id]; if (is_kdump_kernel()) { skt = (cpu_logical_map(cpu) >> 16) & 0xff; diff --git a/drivers/irqchip/irq-gic-phytium-2500.c b/drivers/irqchip/irq-gic-phytium-2500.c index 8674463a08c60f44c0d7315a4cfa3f82ba2e7db6..103e97f5855e689f2e8ed1518c81116f9ce896af 100644 --- a/drivers/irqchip/irq-gic-phytium-2500.c +++ b/drivers/irqchip/irq-gic-phytium-2500.c @@ -1123,8 +1123,7 @@ static int gic_cpumask_select(struct irq_data *d, const struct cpumask *mask_val } cpu = cpumask_any_and(mask_val, cpu_online_mask); - if ((cpu > cpus) && (cpu < (cpus + skt_cpu_cnt[irq_skt]))) - cpus = cpu; + cpus = cpus + cpu % skt_cpu_cnt[irq_skt]; if (is_kdump_kernel()) { skt = (cpu_logical_map(cpu) >> 16) & 0xff; diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 80683f2b723d531cc1cbc0f6fcd6151275cbf862..3bd805f7ce85bffacbd07ffd7c4c65a15a4f901e 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -410,7 +410,7 @@ static blk_status_t dm_dispatch_clone_request(struct request *clone, struct requ clone->rq_flags |= RQF_IO_STAT; clone->start_time_ns = ktime_get_ns(); - r = blk_insert_cloned_request(clone->q, clone); + r = __blk_insert_cloned_request(clone->q, clone, true); if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE) /* must complete clone in terms of original request */ dm_complete_request(rq, r); diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 8aae0624a2971e939fa79acd656f98035049460d..6383afb88f319b813d8df26082aafa9df29dd93c 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -83,14 +83,16 @@ void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, } static int insert_at(size_t value_size, struct btree_node *node, unsigned index, - uint64_t key, void *value) - __dm_written_to_disk(value) + uint64_t key, void *value) + __dm_written_to_disk(value) { uint32_t nr_entries = le32_to_cpu(node->header.nr_entries); + uint32_t max_entries = le32_to_cpu(node->header.max_entries); __le64 key_le = cpu_to_le64(key); if (index > nr_entries || - index >= le32_to_cpu(node->header.max_entries)) { + index >= max_entries || + nr_entries >= max_entries) { DMERR("too many entries in btree node for insert"); __dm_unbless_for_disk(value); return -ENOMEM; diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index a284762e548e1dc840cef28df8f2167220b26051..5115a27196038dbb618444b1c82565b34fc79a4c 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -279,6 +279,11 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result) struct disk_index_entry ie_disk; struct dm_block *blk; + if (b >= ll->nr_blocks) { + DMERR_LIMIT("metadata block out of bounds"); + return -EINVAL; + } + b = do_div(index, ll->entries_per_block); r = ll->load_ie(ll, index, &ie_disk); if (r < 0) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 60eac66dc9f0b3920cad51af7a770cf9cb5dd2bf..890a21bc3e28bbe56568532e210277e47e6411ad 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1658,31 +1658,31 @@ static void mmc_blk_read_single(struct mmc_queue *mq, struct request *req) struct mmc_card *card = mq->card; struct mmc_host *host = card->host; blk_status_t error = BLK_STS_OK; - int retries = 0; do { u32 status; int err; + int retries = 0; - mmc_blk_rw_rq_prep(mqrq, card, 1, mq); + while (retries++ <= MMC_READ_SINGLE_RETRIES) { + 
mmc_blk_rw_rq_prep(mqrq, card, 1, mq); - mmc_wait_for_req(host, mrq); + mmc_wait_for_req(host, mrq); - err = mmc_send_status(card, &status); - if (err) - goto error_exit; - - if (!mmc_host_is_spi(host) && - !mmc_blk_in_tran_state(status)) { - err = mmc_blk_fix_state(card, req); + err = mmc_send_status(card, &status); if (err) goto error_exit; - } - if (mrq->cmd->error && retries++ < MMC_READ_SINGLE_RETRIES) - continue; + if (!mmc_host_is_spi(host) && + !mmc_blk_in_tran_state(status)) { + err = mmc_blk_fix_state(card, req); + if (err) + goto error_exit; + } - retries = 0; + if (!mrq->cmd->error) + break; + } if (mrq->cmd->error || mrq->data->error || diff --git a/drivers/mtd/nand/bbt.c b/drivers/mtd/nand/bbt.c index 044adf91385465ca73e54f53238fb68b03c8a604..64af6898131d656401a42135855c22e183df4109 100644 --- a/drivers/mtd/nand/bbt.c +++ b/drivers/mtd/nand/bbt.c @@ -123,7 +123,7 @@ int nanddev_bbt_set_block_status(struct nand_device *nand, unsigned int entry, unsigned int rbits = bits_per_block + offs - BITS_PER_LONG; pos[1] &= ~GENMASK(rbits - 1, 0); - pos[1] |= val >> rbits; + pos[1] |= val >> (bits_per_block - rbits); } return 0; diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c index 4b90d5b380c2503fc88426890f719bd018d6d2f7..6fbca4ae1113f5e9c0ea984bf7cd30bcfdf87d4b 100644 --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c @@ -1633,7 +1633,7 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip, mtd->oobsize / trans, host->hwcfg.sector_size_1k); - if (!ret) { + if (ret != -EBADMSG) { *err_addr = brcmnand_read_reg(ctrl, BRCMNAND_UNCORR_ADDR) | ((u64)(brcmnand_read_reg(ctrl, diff --git a/drivers/mtd/nand/raw/qcom_nandc.c b/drivers/mtd/nand/raw/qcom_nandc.c index 07d8750313fd6fe44659367061489b0a273d3f8a..0e4be6814fc7c61322cb65c3235131fb7f13da71 100644 --- a/drivers/mtd/nand/raw/qcom_nandc.c +++ b/drivers/mtd/nand/raw/qcom_nandc.c @@ -10,7 +10,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ - #include #include #include @@ -2957,10 +2956,6 @@ static int qcom_nandc_probe(struct platform_device *pdev) if (!nandc->base_dma) return -ENXIO; - ret = qcom_nandc_alloc(nandc); - if (ret) - goto err_nandc_alloc; - ret = clk_prepare_enable(nandc->core_clk); if (ret) goto err_core_clk; @@ -2969,6 +2964,10 @@ static int qcom_nandc_probe(struct platform_device *pdev) if (ret) goto err_aon_clk; + ret = qcom_nandc_alloc(nandc); + if (ret) + goto err_nandc_alloc; + ret = qcom_nandc_setup(nandc); if (ret) goto err_setup; @@ -2980,15 +2979,14 @@ static int qcom_nandc_probe(struct platform_device *pdev) return 0; err_setup: + qcom_nandc_unalloc(nandc); +err_nandc_alloc: clk_disable_unprepare(nandc->aon_clk); err_aon_clk: clk_disable_unprepare(nandc->core_clk); err_core_clk: - qcom_nandc_unalloc(nandc); -err_nandc_alloc: dma_unmap_resource(dev, res->start, resource_size(res), DMA_BIDIRECTIONAL, 0); - return ret; } diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 93dfcef8afc4bc63a2e852b464e6482d3359e53c..e3f814e83d9c83f7630001d573f1019e8d2a5b6b 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -249,7 +249,7 @@ static inline int __check_agg_selection_timer(struct port *port) if (bond == NULL) return 0; - return BOND_AD_INFO(bond).agg_select_timer ? 1 : 0; + return atomic_read(&BOND_AD_INFO(bond).agg_select_timer) ? 
1 : 0; } /** @@ -1012,8 +1012,8 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) if (port->aggregator && port->aggregator->is_active && !__port_is_enabled(port)) { - __enable_port(port); + *update_slave_arr = true; } } break; @@ -1760,6 +1760,7 @@ static void ad_agg_selection_logic(struct aggregator *agg, port = port->next_port_in_aggregator) { __enable_port(port); } + *update_slave_arr = true; } } @@ -1964,7 +1965,7 @@ static void ad_marker_response_received(struct bond_marker *marker, */ void bond_3ad_initiate_agg_selection(struct bonding *bond, int timeout) { - BOND_AD_INFO(bond).agg_select_timer = timeout; + atomic_set(&BOND_AD_INFO(bond).agg_select_timer, timeout); } /** @@ -2248,6 +2249,28 @@ void bond_3ad_update_ad_actor_settings(struct bonding *bond) spin_unlock_bh(&bond->mode_lock); } +/** + * bond_agg_timer_advance - advance agg_select_timer + * @bond: bonding structure + * + * Return true when agg_select_timer reaches 0. + */ +static bool bond_agg_timer_advance(struct bonding *bond) +{ + int val, nval; + + while (1) { + val = atomic_read(&BOND_AD_INFO(bond).agg_select_timer); + if (!val) + return false; + nval = val - 1; + if (atomic_cmpxchg(&BOND_AD_INFO(bond).agg_select_timer, + val, nval) == val) + break; + } + return nval == 0; +} + /** * bond_3ad_state_machine_handler - handle state machines timeout * @bond: bonding struct to work on @@ -2283,9 +2306,7 @@ void bond_3ad_state_machine_handler(struct work_struct *work) if (!bond_has_slaves(bond)) goto re_arm; - /* check if agg_select_timer timer after initialize is timed out */ - if (BOND_AD_INFO(bond).agg_select_timer && - !(--BOND_AD_INFO(bond).agg_select_timer)) { + if (bond_agg_timer_advance(bond)) { slave = bond_first_slave_rcu(bond); port = slave ? &(SLAVE_AD_INFO(slave)->port) : NULL; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c952ab169e4e01ea093472e779f7d4959ac26fc1..f4d40e983e9924b46b7e511cb62913359860329d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -782,9 +782,6 @@ static bool bond_should_notify_peers(struct bonding *bond) slave = rcu_dereference(bond->curr_active_slave); rcu_read_unlock(); - netdev_dbg(bond->dev, "bond_should_notify_peers: slave %s\n", - slave ? slave->dev->name : "NULL"); - if (!slave || !bond->send_peer_notif || bond->send_peer_notif % max(1, bond->params.peer_notif_delay) != 0 || @@ -792,6 +789,9 @@ static bool bond_should_notify_peers(struct bonding *bond) test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state)) return false; + netdev_dbg(bond->dev, "bond_should_notify_peers: slave %s\n", + slave ? 
slave->dev->name : "NULL"); + return true; } diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 7e07e213f4c2fc0395f31461a06ca076516ec509..1745cba04c18014782226b9825cedb2dd971209a 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1458,7 +1458,7 @@ static int bond_option_ad_actor_system_set(struct bonding *bond, mac = (u8 *)&newval->value; } - if (!is_valid_ether_addr(mac)) + if (is_multicast_ether_addr(mac)) goto err; netdev_dbg(bond->dev, "Setting ad_actor_system to %pM\n", mac); diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index b7dfd4109d24ef3db5c03d78fb078f7f7cf6e3ab..7a92f640c3796b4852945b54aff2aa98a269b228 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -823,7 +823,6 @@ static netdev_tx_t ems_usb_start_xmit(struct sk_buff *skb, struct net_device *ne usb_unanchor_urb(urb); usb_free_coherent(dev->udev, size, buf, urb->transfer_dma); - dev_kfree_skb(skb); atomic_dec(&dev->active_tx_urbs); diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c index 1b0afeaf1a3c22ee3911635f0e2d2a464b51ed7f..25fcec5125ed1050b8fcbec20b3e9764090bfdba 100644 --- a/drivers/net/can/usb/mcba_usb.c +++ b/drivers/net/can/usb/mcba_usb.c @@ -377,7 +377,6 @@ static netdev_tx_t mcba_usb_start_xmit(struct sk_buff *skb, xmit_failed: can_free_echo_skb(priv->netdev, ctx->ndx); mcba_usb_free_ctx(ctx); - dev_kfree_skb(skb); stats->tx_dropped++; return NETDEV_TX_OK; diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c index 3e44164736079d467644b63780002bbade28f933..9cd67262a5edee81885dc58290ccf5903267fa31 100644 --- a/drivers/net/can/usb/usb_8dev.c +++ b/drivers/net/can/usb/usb_8dev.c @@ -680,9 +680,20 @@ static netdev_tx_t usb_8dev_start_xmit(struct sk_buff *skb, atomic_inc(&priv->active_tx_urbs); err = usb_submit_urb(urb, GFP_ATOMIC); - if (unlikely(err)) - goto failed; - else if (atomic_read(&priv->active_tx_urbs) >= MAX_TX_URBS) + if (unlikely(err)) { + can_free_echo_skb(netdev, context->echo_index); + + usb_unanchor_urb(urb); + usb_free_coherent(priv->udev, size, buf, urb->transfer_dma); + + atomic_dec(&priv->active_tx_urbs); + + if (err == -ENODEV) + netif_device_detach(netdev); + else + netdev_warn(netdev, "failed tx_urb %d\n", err); + stats->tx_dropped++; + } else if (atomic_read(&priv->active_tx_urbs) >= MAX_TX_URBS) /* Slow down tx path */ netif_stop_queue(netdev); @@ -701,19 +712,6 @@ static netdev_tx_t usb_8dev_start_xmit(struct sk_buff *skb, return NETDEV_TX_BUSY; -failed: - can_free_echo_skb(netdev, context->echo_index); - - usb_unanchor_urb(urb); - usb_free_coherent(priv->udev, size, buf, urb->transfer_dma); - - atomic_dec(&priv->active_tx_urbs); - - if (err == -ENODEV) - netif_device_detach(netdev); - else - netdev_warn(netdev, "failed tx_urb %d\n", err); - nomembuf: usb_free_urb(urb); diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index aebf0d272251550ac38dc0fbe404ee10f07daba9..f8131e4e343a7a25d8392551eeb46a6da571586f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -32,7 +32,7 @@ #include #include -#define HNAE3_MOD_VERSION "21.12.4" +#define HNAE3_MOD_VERSION "22.2.1" #define HNAE3_MIN_VECTOR_NUM 2 /* first one for misc, another for IO */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h b/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h index 
f3a905252deb8b57be685cbe053cf1ed14ed192f..a52b964d953ca31e26be1c29bbc2ad3765ac19f2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_cae/hns3_cae_version.h @@ -4,7 +4,7 @@ #ifndef __HNS3_CAE_VERSION_H__ #define __HNS3_CAE_VERSION_H__ -#define HNS3_CAE_MOD_VERSION "21.12.4" +#define HNS3_CAE_MOD_VERSION "22.2.1" #define CMT_ID_LEN 8 #define RESV_LEN 3 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index da140a3d568d450c252cc1da8663a8c81ac5b644..df7b9d3434ad562cb76c676d0033ae341114cd97 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -8,7 +8,7 @@ #include "hnae3.h" -#define HNS3_MOD_VERSION "21.12.4" +#define HNS3_MOD_VERSION "22.2.1" extern char hns3_driver_version[]; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index ef870da7a8a4b7defbd24de5de263ce5d0640eb2..be771c75e37e3ffa49b9ed952e28274a8a6eea8f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -395,8 +395,7 @@ static void hns3_self_test(struct net_device *ndev, #if IS_ENABLED(CONFIG_VLAN_8021Q) if (dis_vlan_filter) - h->ae_algo->ops->enable_vlan_filter(h, - ndev->flags & IFF_PROMISC); + h->ae_algo->ops->enable_vlan_filter(h, true); #endif if (if_running) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index b4c8f410e6f89884d34888999e38c7c84c984d4e..ca11b49c93a27a99195f2ae9bf4238ee7c08270a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -8324,16 +8324,30 @@ static int hclge_set_vlan_filter_ctrl(struct hclge_dev *hdev, u8 vlan_type, struct hclge_desc desc; int ret; - hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_VLAN_FILTER_CTRL, false); + /* read current vlan filter parameter */ + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_VLAN_FILTER_CTRL, true); req = (struct hclge_vlan_filter_ctrl_cmd *)desc.data; req->vlan_type = vlan_type; - req->vlan_fe = filter_en ? fe_type : 0; req->vf_id = vf_id; + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to get vport%u vlan filter config, ret = %d.\n", + vf_id, ret); + return ret; + } + + /* modify and write new config parameter */ + hclge_cmd_reuse_desc(&desc, false); + req->vlan_fe = filter_en ? 
+ (req->vlan_fe | fe_type) : (req->vlan_fe & ~fe_type); + ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) dev_err(&hdev->pdev->dev, - "set vlan filter fail, ret =%d.\n", ret); + "failed to set vport%u vlan filter, ret = %d.\n", + vf_id, ret); return ret; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 81451ccb4140c01de8523fe64d92f327bdab99da..027d9137f76de477b36dded5120b4bbe1e9ea6cf 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -12,7 +12,7 @@ #include "hclge_cmd.h" #include "hnae3.h" -#define HCLGE_MOD_VERSION "21.12.4" +#define HCLGE_MOD_VERSION "22.2.1" #define HCLGE_DRIVER_NAME "hclge" #define HCLGE_MAX_PF_NUM 8 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 6880c4cb52f3b4b3bc86316c1539a49c82b9e2ba..b2fd2a1546254c02de420470025cb73fe7d58db1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -10,7 +10,7 @@ #include "hclgevf_cmd.h" #include "hnae3.h" -#define HCLGEVF_MOD_VERSION "21.12.4" +#define HCLGEVF_MOD_VERSION "22.2.1" #define HCLGEVF_DRIVER_NAME "hclgevf" #define HCLGEVF_MAX_VLAN_ID 4095 diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 4fc906c6166b34c790ebe60dc228d52756cf7eb4..074e23b4534b718f6af1df1f0d2169e447cadcbd 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -179,11 +179,14 @@ struct vf_data_storage { u16 pf_vlan; /* When set, guest VLAN config not allowed. */ u16 pf_qos; u16 tx_rate; + int link_enable; + int link_state; u8 spoofchk_enabled; bool rss_query_enabled; u8 trusted; int xcast_mode; unsigned int vf_api; + u8 primary_abort_count; }; enum ixgbevf_xcast_modes { @@ -546,6 +549,8 @@ struct ixgbe_mac_addr { #define IXGBE_TRY_LINK_TIMEOUT (4 * HZ) #define IXGBE_SFP_POLL_JIFFIES (2 * HZ) /* SFP poll every 2 seconds */ +#define IXGBE_PRIMARY_ABORT_LIMIT 5 + /* board specific private data structure */ struct ixgbe_adapter { unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; @@ -605,6 +610,7 @@ struct ixgbe_adapter { #define IXGBE_FLAG2_EEE_ENABLED BIT(15) #define IXGBE_FLAG2_RX_LEGACY BIT(16) #define IXGBE_FLAG2_IPSEC_ENABLED BIT(17) +#define IXGBE_FLAG2_AUTO_DISABLE_VF BIT(19) /* Tx fast path data */ int num_tx_queues; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 8829bd95d0d36557acf0e669f77f6680e45485c5..73e769212c65b9f6e22c9566dc0110d1cc4d93d6 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -136,6 +136,8 @@ static const char ixgbe_gstrings_test[][ETH_GSTRING_LEN] = { static const char ixgbe_priv_flags_strings[][ETH_GSTRING_LEN] = { #define IXGBE_PRIV_FLAGS_LEGACY_RX BIT(0) "legacy-rx", +#define IXGBE_PRIV_FLAGS_AUTO_DISABLE_VF BIT(2) + "mdd-disable-vf", }; #define IXGBE_PRIV_FLAGS_STR_LEN ARRAY_SIZE(ixgbe_priv_flags_strings) @@ -3410,6 +3412,9 @@ static u32 ixgbe_get_priv_flags(struct net_device *netdev) if (adapter->flags2 & IXGBE_FLAG2_RX_LEGACY) priv_flags |= IXGBE_PRIV_FLAGS_LEGACY_RX; + if (adapter->flags2 & IXGBE_FLAG2_AUTO_DISABLE_VF) + priv_flags |= IXGBE_PRIV_FLAGS_AUTO_DISABLE_VF; + return priv_flags; } @@ -3417,11 +3422,27 @@ static int 
ixgbe_set_priv_flags(struct net_device *netdev, u32 priv_flags) { struct ixgbe_adapter *adapter = netdev_priv(netdev); unsigned int flags2 = adapter->flags2; + unsigned int i; flags2 &= ~IXGBE_FLAG2_RX_LEGACY; if (priv_flags & IXGBE_PRIV_FLAGS_LEGACY_RX) flags2 |= IXGBE_FLAG2_RX_LEGACY; + flags2 &= ~IXGBE_FLAG2_AUTO_DISABLE_VF; + if (priv_flags & IXGBE_PRIV_FLAGS_AUTO_DISABLE_VF) { + if (adapter->hw.mac.type == ixgbe_mac_82599EB) { + /* Reset primary abort counter */ + for (i = 0; i < adapter->num_vfs; i++) + adapter->vfinfo[i].primary_abort_count = 0; + + flags2 |= IXGBE_FLAG2_AUTO_DISABLE_VF; + } else { + e_info(probe, + "Cannot set private flags: Operation not supported\n"); + return -EOPNOTSUPP; + } + } + if (flags2 != adapter->flags2) { adapter->flags2 = flags2; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index b52585fbb295d0c9081280038cb8b7c59fb3c758..9455c619169485d317b1b4408cc1fdd23ee1bea6 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -5622,6 +5622,9 @@ static void ixgbe_up_complete(struct ixgbe_adapter *adapter) ctrl_ext = IXGBE_READ_REG(hw, IXGBE_CTRL_EXT); ctrl_ext |= IXGBE_CTRL_EXT_PFRSTD; IXGBE_WRITE_REG(hw, IXGBE_CTRL_EXT, ctrl_ext); + + /* update setting rx tx for all active vfs */ + ixgbe_set_all_vfs(adapter); } void ixgbe_reinit_locked(struct ixgbe_adapter *adapter) @@ -6074,11 +6077,8 @@ void ixgbe_down(struct ixgbe_adapter *adapter) for (i = 0 ; i < adapter->num_vfs; i++) adapter->vfinfo[i].clear_to_send = false; - /* ping all the active vfs to let them know we are going down */ - ixgbe_ping_all_vfs(adapter); - - /* Disable all VFTE/VFRE TX/RX */ - ixgbe_disable_tx_rx(adapter); + /* update setting rx tx for all active vfs */ + ixgbe_set_all_vfs(adapter); } /* disable transmits in the hardware now that interrupts are off */ @@ -7565,6 +7565,27 @@ static void ixgbe_watchdog_flush_tx(struct ixgbe_adapter *adapter) } #ifdef CONFIG_PCI_IOV +static void ixgbe_bad_vf_abort(struct ixgbe_adapter *adapter, u32 vf) +{ + struct ixgbe_hw *hw = &adapter->hw; + + if (adapter->hw.mac.type == ixgbe_mac_82599EB && + adapter->flags2 & IXGBE_FLAG2_AUTO_DISABLE_VF) { + adapter->vfinfo[vf].primary_abort_count++; + if (adapter->vfinfo[vf].primary_abort_count == + IXGBE_PRIMARY_ABORT_LIMIT) { + ixgbe_set_vf_link_state(adapter, vf, + IFLA_VF_LINK_STATE_DISABLE); + adapter->vfinfo[vf].primary_abort_count = 0; + + e_info(drv, + "Malicious Driver Detection event detected on PF %d VF %d MAC: %pM mdd-disable-vf=on", + hw->bus.func, vf, + adapter->vfinfo[vf].vf_mac_addresses); + } + } +} + static void ixgbe_check_for_bad_vf(struct ixgbe_adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; @@ -7596,8 +7617,10 @@ static void ixgbe_check_for_bad_vf(struct ixgbe_adapter *adapter) continue; pci_read_config_word(vfdev, PCI_STATUS, &status_reg); if (status_reg != IXGBE_FAILED_READ_CFG_WORD && - status_reg & PCI_STATUS_REC_MASTER_ABORT) + status_reg & PCI_STATUS_REC_MASTER_ABORT) { + ixgbe_bad_vf_abort(adapter, vf); pcie_flr(vfdev); + } } } @@ -10288,6 +10311,7 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_set_vf_vlan = ixgbe_ndo_set_vf_vlan, .ndo_set_vf_rate = ixgbe_ndo_set_vf_bw, .ndo_set_vf_spoofchk = ixgbe_ndo_set_vf_spoofchk, + .ndo_set_vf_link_state = ixgbe_ndo_set_vf_link_state, .ndo_set_vf_rss_query_en = ixgbe_ndo_set_vf_rss_query_en, .ndo_set_vf_trust = ixgbe_ndo_set_vf_trust, .ndo_get_vf_config = ixgbe_ndo_get_vf_config, @@ -10601,6 +10625,9 @@ 
static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto err_sw_init; + if (adapter->hw.mac.type == ixgbe_mac_82599EB) + adapter->flags2 |= IXGBE_FLAG2_AUTO_DISABLE_VF; + /* Make sure the SWFW semaphore is in a valid state */ if (hw->mac.ops.init_swfw_sync) hw->mac.ops.init_swfw_sync(hw); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h index e085b6520dac8871ebe4f117024aecd38c55b650..2246b07beae462a74fe6d59258daa414f3e5addf 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h @@ -80,6 +80,8 @@ enum ixgbe_pfvf_api_rev { #define IXGBE_VF_UPDATE_XCAST_MODE 0x0c +#define IXGBE_VF_GET_LINK_STATE 0x10 /* get vf link state */ + /* length of permanent address message returned from PF */ #define IXGBE_VF_PERMADDR_MSG_LEN 4 /* word in permanent address message with the current multicast type */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c index 8aaf856771d7be1643d2fd1a172abd89c85bd0fb..1f0e84120b32df2d9c2d0071478bd2181701a3b6 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c @@ -96,6 +96,7 @@ static int __ixgbe_enable_sriov(struct ixgbe_adapter *adapter, for (i = 0; i < num_vfs; i++) { /* enable spoof checking for all VFs */ adapter->vfinfo[i].spoofchk_enabled = true; + adapter->vfinfo[i].link_enable = true; /* We support VF RSS querying only for 82599 and x540 * devices at the moment. These devices share RSS @@ -816,6 +817,57 @@ static inline void ixgbe_write_qde(struct ixgbe_adapter *adapter, u32 vf, } } +/** + * ixgbe_set_vf_rx_tx - Set VF rx tx + * @adapter: Pointer to adapter struct + * @vf: VF identifier + * + * Set or reset correct transmit and receive for vf + **/ +static void ixgbe_set_vf_rx_tx(struct ixgbe_adapter *adapter, int vf) +{ + u32 reg_cur_tx, reg_cur_rx, reg_req_tx, reg_req_rx; + struct ixgbe_hw *hw = &adapter->hw; + u32 reg_offset, vf_shift; + + vf_shift = vf % 32; + reg_offset = vf / 32; + + reg_cur_tx = IXGBE_READ_REG(hw, IXGBE_VFTE(reg_offset)); + reg_cur_rx = IXGBE_READ_REG(hw, IXGBE_VFRE(reg_offset)); + + if (adapter->vfinfo[vf].link_enable) { + reg_req_tx = reg_cur_tx | 1 << vf_shift; + reg_req_rx = reg_cur_rx | 1 << vf_shift; + } else { + reg_req_tx = reg_cur_tx & ~(1 << vf_shift); + reg_req_rx = reg_cur_rx & ~(1 << vf_shift); + } + + /* The 82599 cannot support a mix of jumbo and non-jumbo PF/VFs. 
+ * For more info take a look at ixgbe_set_vf_lpe + */ + if (adapter->hw.mac.type == ixgbe_mac_82599EB) { + struct net_device *dev = adapter->netdev; + int pf_max_frame = dev->mtu + ETH_HLEN; + +#if IS_ENABLED(CONFIG_FCOE) + if (dev->features & NETIF_F_FCOE_MTU) + pf_max_frame = max_t(int, pf_max_frame, + IXGBE_FCOE_JUMBO_FRAME_SIZE); +#endif /* CONFIG_FCOE */ + + if (pf_max_frame > ETH_FRAME_LEN) + reg_req_rx = reg_cur_rx & ~(1 << vf_shift); + } + + /* Enable/Disable particular VF */ + if (reg_cur_tx != reg_req_tx) + IXGBE_WRITE_REG(hw, IXGBE_VFTE(reg_offset), reg_req_tx); + if (reg_cur_rx != reg_req_rx) + IXGBE_WRITE_REG(hw, IXGBE_VFRE(reg_offset), reg_req_rx); +} + static int ixgbe_vf_reset_msg(struct ixgbe_adapter *adapter, u32 vf) { struct ixgbe_ring_feature *vmdq = &adapter->ring_feature[RING_F_VMDQ]; @@ -841,11 +893,6 @@ static int ixgbe_vf_reset_msg(struct ixgbe_adapter *adapter, u32 vf) vf_shift = vf % 32; reg_offset = vf / 32; - /* enable transmit for vf */ - reg = IXGBE_READ_REG(hw, IXGBE_VFTE(reg_offset)); - reg |= BIT(vf_shift); - IXGBE_WRITE_REG(hw, IXGBE_VFTE(reg_offset), reg); - /* force drop enable for all VF Rx queues */ reg = IXGBE_QDE_ENABLE; if (adapter->vfinfo[vf].pf_vlan) @@ -853,27 +900,7 @@ static int ixgbe_vf_reset_msg(struct ixgbe_adapter *adapter, u32 vf) ixgbe_write_qde(adapter, vf, reg); - /* enable receive for vf */ - reg = IXGBE_READ_REG(hw, IXGBE_VFRE(reg_offset)); - reg |= BIT(vf_shift); - /* - * The 82599 cannot support a mix of jumbo and non-jumbo PF/VFs. - * For more info take a look at ixgbe_set_vf_lpe - */ - if (adapter->hw.mac.type == ixgbe_mac_82599EB) { - struct net_device *dev = adapter->netdev; - int pf_max_frame = dev->mtu + ETH_HLEN; - -#ifdef CONFIG_FCOE - if (dev->features & NETIF_F_FCOE_MTU) - pf_max_frame = max_t(int, pf_max_frame, - IXGBE_FCOE_JUMBO_FRAME_SIZE); - -#endif /* CONFIG_FCOE */ - if (pf_max_frame > ETH_FRAME_LEN) - reg &= ~BIT(vf_shift); - } - IXGBE_WRITE_REG(hw, IXGBE_VFRE(reg_offset), reg); + ixgbe_set_vf_rx_tx(adapter, vf); /* enable VF mailbox for further messages */ adapter->vfinfo[vf].clear_to_send = true; @@ -1193,6 +1220,25 @@ static int ixgbe_update_vf_xcast_mode(struct ixgbe_adapter *adapter, return 0; } +static int ixgbe_get_vf_link_state(struct ixgbe_adapter *adapter, + u32 *msgbuf, u32 vf) +{ + u32 *link_state = &msgbuf[1]; + + /* verify the PF is supporting the correct API */ + switch (adapter->vfinfo[vf].vf_api) { + case ixgbe_mbox_api_12: + case ixgbe_mbox_api_13: + break; + default: + return -EOPNOTSUPP; + } + + *link_state = adapter->vfinfo[vf].link_enable; + + return 0; +} + static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf) { u32 mbx_size = IXGBE_VFMAILBOX_SIZE; @@ -1258,6 +1304,9 @@ static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf) case IXGBE_VF_UPDATE_XCAST_MODE: retval = ixgbe_update_vf_xcast_mode(adapter, msgbuf, vf); break; + case IXGBE_VF_GET_LINK_STATE: + retval = ixgbe_get_vf_link_state(adapter, msgbuf, vf); + break; default: e_err(drv, "Unhandled Msg %8.8x\n", msgbuf[0]); retval = IXGBE_ERR_MBX; @@ -1307,18 +1356,6 @@ void ixgbe_msg_task(struct ixgbe_adapter *adapter) } } -void ixgbe_disable_tx_rx(struct ixgbe_adapter *adapter) -{ - struct ixgbe_hw *hw = &adapter->hw; - - /* disable transmit and receive for all vfs */ - IXGBE_WRITE_REG(hw, IXGBE_VFTE(0), 0); - IXGBE_WRITE_REG(hw, IXGBE_VFTE(1), 0); - - IXGBE_WRITE_REG(hw, IXGBE_VFRE(0), 0); - IXGBE_WRITE_REG(hw, IXGBE_VFRE(1), 0); -} - static inline void ixgbe_ping_vf(struct ixgbe_adapter *adapter, int vf) 
{ struct ixgbe_hw *hw = &adapter->hw; @@ -1344,6 +1381,21 @@ void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter) } } +/** + * ixgbe_set_all_vfs - update vfs queues + * @adapter: Pointer to adapter struct + * + * Update setting transmit and receive queues for all vfs + **/ +void ixgbe_set_all_vfs(struct ixgbe_adapter *adapter) +{ + int i; + + for (i = 0 ; i < adapter->num_vfs; i++) + ixgbe_set_vf_link_state(adapter, i, + adapter->vfinfo[i].link_state); +} + int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac) { struct ixgbe_adapter *adapter = netdev_priv(netdev); @@ -1641,6 +1693,84 @@ int ixgbe_ndo_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting) return 0; } +/** + * ixgbe_set_vf_link_state - Set link state + * @adapter: Pointer to adapter struct + * @vf: VF identifier + * @state: required link state + * + * Set a link force state on/off a single vf + **/ +void ixgbe_set_vf_link_state(struct ixgbe_adapter *adapter, int vf, int state) +{ + adapter->vfinfo[vf].link_state = state; + + switch (state) { + case IFLA_VF_LINK_STATE_AUTO: + if (test_bit(__IXGBE_DOWN, &adapter->state)) + adapter->vfinfo[vf].link_enable = false; + else + adapter->vfinfo[vf].link_enable = true; + break; + case IFLA_VF_LINK_STATE_ENABLE: + adapter->vfinfo[vf].link_enable = true; + break; + case IFLA_VF_LINK_STATE_DISABLE: + adapter->vfinfo[vf].link_enable = false; + break; + } + + ixgbe_set_vf_rx_tx(adapter, vf); + + /* restart the VF */ + adapter->vfinfo[vf].clear_to_send = false; + ixgbe_ping_vf(adapter, vf); +} + +/** + * ixgbe_ndo_set_vf_link_state - Set link state + * @netdev: network interface device structure + * @vf: VF identifier + * @state: required link state + * + * Set the link state of a specified VF, regardless of physical link state + **/ +int ixgbe_ndo_set_vf_link_state(struct net_device *netdev, int vf, int state) +{ + struct ixgbe_adapter *adapter = netdev_priv(netdev); + int ret = 0; + + if (vf < 0 || vf >= adapter->num_vfs) { + dev_err(&adapter->pdev->dev, + "NDO set VF link - invalid VF identifier %d\n", vf); + return -EINVAL; + } + + switch (state) { + case IFLA_VF_LINK_STATE_ENABLE: + dev_info(&adapter->pdev->dev, + "NDO set VF %d link state %d - not supported\n", + vf, state); + break; + case IFLA_VF_LINK_STATE_DISABLE: + dev_info(&adapter->pdev->dev, + "NDO set VF %d link state disable\n", vf); + ixgbe_set_vf_link_state(adapter, vf, state); + break; + case IFLA_VF_LINK_STATE_AUTO: + dev_info(&adapter->pdev->dev, + "NDO set VF %d link state auto\n", vf); + ixgbe_set_vf_link_state(adapter, vf, state); + break; + default: + dev_err(&adapter->pdev->dev, + "NDO set VF %d - invalid link state %d\n", vf, state); + ret = -EINVAL; + } + + return ret; +} + int ixgbe_ndo_set_vf_rss_query_en(struct net_device *netdev, int vf, bool setting) { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h index 3ec21923c89cd9e38cd39327445a184f68f661c9..0690ecb8dfa348e9493fe9e9003bc3b8225a928e 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h @@ -17,8 +17,8 @@ void ixgbe_restore_vf_multicasts(struct ixgbe_adapter *adapter); #endif void ixgbe_msg_task(struct ixgbe_adapter *adapter); int ixgbe_vf_configuration(struct pci_dev *pdev, unsigned int event_mask); -void ixgbe_disable_tx_rx(struct ixgbe_adapter *adapter); void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter); +void ixgbe_set_all_vfs(struct ixgbe_adapter *adapter); int ixgbe_ndo_set_vf_mac(struct net_device 
*netdev, int queue, u8 *mac); int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int queue, u16 vlan, u8 qos, __be16 vlan_proto); @@ -31,7 +31,9 @@ int ixgbe_ndo_set_vf_rss_query_en(struct net_device *netdev, int vf, int ixgbe_ndo_set_vf_trust(struct net_device *netdev, int vf, bool setting); int ixgbe_ndo_get_vf_config(struct net_device *netdev, int vf, struct ifla_vf_info *ivi); +int ixgbe_ndo_set_vf_link_state(struct net_device *netdev, int vf, int state); void ixgbe_check_vf_rate_limit(struct ixgbe_adapter *adapter); +void ixgbe_set_vf_link_state(struct ixgbe_adapter *adapter, int vf, int state); int ixgbe_disable_sriov(struct ixgbe_adapter *adapter); #ifdef CONFIG_PCI_IOV void ixgbe_enable_sriov(struct ixgbe_adapter *adapter, unsigned int max_vfs); diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index 56a1031dcc074deccaa65eb6e2eaac3800016175..469df8a242e82d726ad1c1fd6f3638c8b998639d 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -383,6 +383,8 @@ struct ixgbevf_adapter { u32 *rss_key; u8 rss_indir_tbl[IXGBEVF_X550_VFRETA_SIZE]; u32 flags; + bool link_state; + #define IXGBEVF_FLAGS_LEGACY_RX BIT(1) }; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 25785fee5422e4ca3d7cf86475d78fe493ae3036..450c0a95d36d81f5521f46b710f975cec565cd59 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -2263,7 +2263,9 @@ static void ixgbevf_negotiate_api(struct ixgbevf_adapter *adapter) static void ixgbevf_up_complete(struct ixgbevf_adapter *adapter) { struct net_device *netdev = adapter->netdev; + struct pci_dev *pdev = adapter->pdev; struct ixgbe_hw *hw = &adapter->hw; + bool state; ixgbevf_configure_msix(adapter); @@ -2276,6 +2278,11 @@ static void ixgbevf_up_complete(struct ixgbevf_adapter *adapter) spin_unlock_bh(&adapter->mbx_lock); + state = adapter->link_state; + hw->mac.ops.get_link_state(hw, &adapter->link_state); + if (state && state != adapter->link_state) + dev_info(&pdev->dev, "VF is administratively disabled\n"); + smp_mb__before_atomic(); clear_bit(__IXGBEVF_DOWN, &adapter->state); ixgbevf_napi_enable_all(adapter); @@ -3045,6 +3052,8 @@ static int ixgbevf_sw_init(struct ixgbevf_adapter *adapter) adapter->tx_ring_count = IXGBEVF_DEFAULT_TXD; adapter->rx_ring_count = IXGBEVF_DEFAULT_RXD; + adapter->link_state = true; + set_bit(__IXGBEVF_DOWN, &adapter->state); return 0; @@ -3277,7 +3286,7 @@ static void ixgbevf_watchdog_subtask(struct ixgbevf_adapter *adapter) ixgbevf_watchdog_update_link(adapter); - if (adapter->link_up) + if (adapter->link_up && adapter->link_state) ixgbevf_watchdog_link_is_up(adapter); else ixgbevf_watchdog_link_is_down(adapter); diff --git a/drivers/net/ethernet/intel/ixgbevf/mbx.h b/drivers/net/ethernet/intel/ixgbevf/mbx.h index bfd9ae150808810947e661907cae8e7e26cd5531..464841e2131576b12cb6a30d07c9477ea9c1675a 100644 --- a/drivers/net/ethernet/intel/ixgbevf/mbx.h +++ b/drivers/net/ethernet/intel/ixgbevf/mbx.h @@ -92,6 +92,8 @@ enum ixgbe_pfvf_api_rev { #define IXGBE_VF_UPDATE_XCAST_MODE 0x0c +#define IXGBE_VF_GET_LINK_STATE 0x10 /* get vf link state */ + /* length of permanent address message returned from PF */ #define IXGBE_VF_PERMADDR_MSG_LEN 4 /* word in permanent address message with the current multicast type */ diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.c b/drivers/net/ethernet/intel/ixgbevf/vf.c 
index 7597d099c584149f8d675dca67f67162528c033f..c96a4c3ca0754467c430743ce604bf881dee8e9a 100644 --- a/drivers/net/ethernet/intel/ixgbevf/vf.c +++ b/drivers/net/ethernet/intel/ixgbevf/vf.c @@ -570,6 +570,46 @@ static s32 ixgbevf_hv_update_xcast_mode(struct ixgbe_hw *hw, int xcast_mode) return -EOPNOTSUPP; } +/** + * ixgbevf_get_link_state_vf - Get VF link state from PF + * @hw: pointer to the HW structure + * @link_state: link state storage + * + * Returns state of the operation error or success. + */ +static s32 ixgbevf_get_link_state_vf(struct ixgbe_hw *hw, bool *link_state) +{ + u32 msgbuf[2]; + s32 ret_val; + s32 err; + + msgbuf[0] = IXGBE_VF_GET_LINK_STATE; + msgbuf[1] = 0x0; + + err = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, 2); + + if (err || (msgbuf[0] & IXGBE_VT_MSGTYPE_NACK)) { + ret_val = IXGBE_ERR_MBX; + } else { + ret_val = 0; + *link_state = msgbuf[1]; + } + + return ret_val; +} + +/** + * ixgbevf_hv_get_link_state_vf - * Hyper-V variant - just a stub. + * @hw: unused + * @link_state: unused + * + * Hyper-V variant; there is no mailbox communication. + */ +static s32 ixgbevf_hv_get_link_state_vf(struct ixgbe_hw *hw, bool *link_state) +{ + return -EOPNOTSUPP; +} + /** * ixgbevf_set_vfta_vf - Set/Unset VLAN filter table address * @hw: pointer to the HW structure @@ -946,6 +986,7 @@ static const struct ixgbe_mac_operations ixgbevf_mac_ops = { .set_rar = ixgbevf_set_rar_vf, .update_mc_addr_list = ixgbevf_update_mc_addr_list_vf, .update_xcast_mode = ixgbevf_update_xcast_mode, + .get_link_state = ixgbevf_get_link_state_vf, .set_uc_addr = ixgbevf_set_uc_addr_vf, .set_vfta = ixgbevf_set_vfta_vf, .set_rlpml = ixgbevf_set_rlpml_vf, @@ -963,6 +1004,7 @@ static const struct ixgbe_mac_operations ixgbevf_hv_mac_ops = { .set_rar = ixgbevf_hv_set_rar_vf, .update_mc_addr_list = ixgbevf_hv_update_mc_addr_list_vf, .update_xcast_mode = ixgbevf_hv_update_xcast_mode, + .get_link_state = ixgbevf_hv_get_link_state_vf, .set_uc_addr = ixgbevf_hv_set_uc_addr_vf, .set_vfta = ixgbevf_hv_set_vfta_vf, .set_rlpml = ixgbevf_hv_set_rlpml_vf, diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.h b/drivers/net/ethernet/intel/ixgbevf/vf.h index d1e9e306653b87ca9ee6e73543ca918f8a3787af..45d9269218db60bee69b23a32e6fef5eb95e7175 100644 --- a/drivers/net/ethernet/intel/ixgbevf/vf.h +++ b/drivers/net/ethernet/intel/ixgbevf/vf.h @@ -42,6 +42,7 @@ struct ixgbe_mac_operations { s32 (*init_rx_addrs)(struct ixgbe_hw *); s32 (*update_mc_addr_list)(struct ixgbe_hw *, struct net_device *); s32 (*update_xcast_mode)(struct ixgbe_hw *, int); + s32 (*get_link_state)(struct ixgbe_hw *hw, bool *link_state); s32 (*enable_mc)(struct ixgbe_hw *); s32 (*disable_mc)(struct ixgbe_hw *); s32 (*clear_vfta)(struct ixgbe_hw *); diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 1001e9a2edd4f6a9b92118b2a23db0a7f2b4ccb1..6f7e1598106f88958cc2743954092ed36877e5ba 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -311,7 +311,6 @@ static void sp_setup(struct net_device *dev) { /* Finish setting up the DEVICE info. */ dev->netdev_ops = &sp_netdev_ops; - dev->needs_free_netdev = true; dev->mtu = SIXP_MTU; dev->hard_header_len = AX25_MAX_HEADER_LEN; dev->header_ops = &ax25_header_ops; @@ -674,14 +673,16 @@ static void sixpack_close(struct tty_struct *tty) */ netif_stop_queue(sp->dev); + unregister_netdev(sp->dev); + del_timer_sync(&sp->tx_t); del_timer_sync(&sp->resync_t); - /* Free all 6pack frame buffers. */ + /* Free all 6pack frame buffers after unreg. 
*/ kfree(sp->rbuff); kfree(sp->xbuff); - unregister_netdev(sp->dev); + free_netdev(sp->dev); } /* Perform I/O control on an active 6pack channel. */ diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index 3b14e6e281d43507320f72d14a5397712ea48601..a6311310dcaacad612e0050d8b8e7248a95c7f03 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -803,13 +803,13 @@ static void mkiss_close(struct tty_struct *tty) */ netif_stop_queue(ax->dev); + unregister_netdev(ax->dev); + /* Free all AX25 frame buffers. */ kfree(ax->rbuff); kfree(ax->xbuff); ax->tty = NULL; - - unregister_netdev(ax->dev); } /* Perform I/O control on an active ax25 channel. */ diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 1117355313bc6656234e899f286c06a5f13bd959..2de2cbed7c5fc4e6613ee1a85932d9f9274eae9e 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1154,6 +1154,9 @@ void phy_detach(struct phy_device *phydev) phydev->mdio.dev.driver == &genphy_driver.mdiodrv.driver) device_release_driver(&phydev->mdio.dev); + /* Assert the reset signal */ + phy_device_reset(phydev, 1); + /* * The phydev might go away on the put_device() below, so avoid * a use-after-free bug by reading the underlying bus first. @@ -1163,9 +1166,6 @@ void phy_detach(struct phy_device *phydev) put_device(&phydev->mdio.dev); if (ndev_owner != bus->owner) module_put(bus->owner); - - /* Assert the reset signal */ - phy_device_reset(phydev, 1); } EXPORT_SYMBOL(phy_detach); diff --git a/drivers/net/usb/mcs7830.c b/drivers/net/usb/mcs7830.c index 5a47e5510ca8243eed8f12cc86a4ec70ac42658e..c0f52a622964f66934313ec027660abd57d8f9e8 100644 --- a/drivers/net/usb/mcs7830.c +++ b/drivers/net/usb/mcs7830.c @@ -121,8 +121,16 @@ static const char driver_name[] = "MOSCHIP usb-ethernet driver"; static int mcs7830_get_reg(struct usbnet *dev, u16 index, u16 size, void *data) { - return usbnet_read_cmd(dev, MCS7830_RD_BREQ, MCS7830_RD_BMREQ, - 0x0000, index, data, size); + int ret; + + ret = usbnet_read_cmd(dev, MCS7830_RD_BREQ, MCS7830_RD_BMREQ, + 0x0000, index, data, size); + if (ret < 0) + return ret; + else if (ret < size) + return -ENODATA; + + return ret; } static int mcs7830_set_reg(struct usbnet *dev, u16 index, u16 size, const void *data) diff --git a/drivers/net/usb/sr9700.c b/drivers/net/usb/sr9700.c index 6ac232e52bf7c683832aee40dc1d0fabcd9a6202..83640628c47dd7a201adda20de572c3698dbbeee 100644 --- a/drivers/net/usb/sr9700.c +++ b/drivers/net/usb/sr9700.c @@ -410,7 +410,7 @@ static int sr9700_rx_fixup(struct usbnet *dev, struct sk_buff *skb) /* ignore the CRC length */ len = (skb->data[1] | (skb->data[2] << 8)) - 4; - if (len > ETH_FRAME_LEN) + if (len > ETH_FRAME_LEN || len > skb->len) return 0; /* the last packet of current skb */ diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 41a00cd76955bf39047fafc0a1641eb347c6ee21..ad098169d23fddc2df282717b4d49c9ce7fbb5b7 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -152,9 +152,10 @@ static void __veth_xdp_flush(struct veth_rq *rq) { /* Write ptr_ring before reading rx_notify_masked */ smp_mb(); - if (!rq->rx_notify_masked) { - rq->rx_notify_masked = true; - napi_schedule(&rq->xdp_napi); + if (!READ_ONCE(rq->rx_notify_masked) && + napi_schedule_prep(&rq->xdp_napi)) { + WRITE_ONCE(rq->rx_notify_masked, true); + __napi_schedule(&rq->xdp_napi); } } @@ -197,8 +198,6 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) if (rxq < rcv->real_num_rx_queues) { rq = 
&rcv_priv->rq[rxq]; rcv_xdp = rcu_access_pointer(rq->xdp_prog); - if (rcv_xdp) - skb_record_rx_queue(skb, rxq); } if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { @@ -625,8 +624,10 @@ static int veth_poll(struct napi_struct *napi, int budget) /* Write rx_notify_masked before reading ptr_ring */ smp_store_mb(rq->rx_notify_masked, false); if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { - rq->rx_notify_masked = true; - napi_schedule(&rq->xdp_napi); + if (napi_schedule_prep(&rq->xdp_napi)) { + WRITE_ONCE(rq->rx_notify_masked, true); + __napi_schedule(&rq->xdp_napi); + } } } diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index da1b93f98cd1aae6b4c26884fe8fb1b31e6240b2..376024c262705d7a708190eea46f77319e024bf7 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -412,14 +412,12 @@ static bool xennet_tx_buf_gc(struct netfront_queue *queue) queue->tx_link[id] = TX_LINK_NONE; skb = queue->tx_skbs[id]; queue->tx_skbs[id] = NULL; - if (unlikely(gnttab_query_foreign_access( - queue->grant_tx_ref[id]) != 0)) { + if (unlikely(!gnttab_end_foreign_access_ref( + queue->grant_tx_ref[id], GNTMAP_readonly))) { dev_alert(dev, "Grant still in use by backend domain\n"); goto err; } - gnttab_end_foreign_access_ref( - queue->grant_tx_ref[id], GNTMAP_readonly); gnttab_release_grant_reference( &queue->gref_tx_head, queue->grant_tx_ref[id]); queue->grant_tx_ref[id] = GRANT_INVALID_REF; @@ -840,7 +838,6 @@ static int xennet_get_responses(struct netfront_queue *queue, int max = XEN_NETIF_NR_SLOTS_MIN + (rx->status <= RX_COPY_THRESHOLD); int slots = 1; int err = 0; - unsigned long ret; if (rx->flags & XEN_NETRXF_extra_info) { err = xennet_get_extras(queue, extras, rp); @@ -871,8 +868,13 @@ static int xennet_get_responses(struct netfront_queue *queue, goto next; } - ret = gnttab_end_foreign_access_ref(ref, 0); - BUG_ON(!ret); + if (!gnttab_end_foreign_access_ref(ref, 0)) { + dev_alert(dev, + "Grant still in use by backend domain\n"); + queue->info->broken = true; + dev_alert(dev, "Disabled for further use\n"); + return -EINVAL; + } gnttab_release_grant_reference(&queue->gref_rx_head, ref); @@ -1076,6 +1078,10 @@ static int xennet_poll(struct napi_struct *napi, int budget) err = xennet_get_responses(queue, &rinfo, rp, &tmpq); if (unlikely(err)) { + if (queue->info->broken) { + spin_unlock(&queue->rx_lock); + return 0; + } err: while ((skb = __skb_dequeue(&tmpq))) __skb_queue_tail(&errq, skb); @@ -1646,7 +1652,7 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_queue *queue, unsigned int feature_split_evtchn) { struct xen_netif_tx_sring *txs; - struct xen_netif_rx_sring *rxs; + struct xen_netif_rx_sring *rxs = NULL; grant_ref_t gref; int err; @@ -1666,21 +1672,21 @@ static int setup_netfront(struct xenbus_device *dev, err = xenbus_grant_ring(dev, txs, 1, &gref); if (err < 0) - goto grant_tx_ring_fail; + goto fail; queue->tx_ring_ref = gref; rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH); if (!rxs) { err = -ENOMEM; xenbus_dev_fatal(dev, err, "allocating rx ring page"); - goto alloc_rx_ring_fail; + goto fail; } SHARED_RING_INIT(rxs); FRONT_RING_INIT(&queue->rx, rxs, XEN_PAGE_SIZE); err = xenbus_grant_ring(dev, rxs, 1, &gref); if (err < 0) - goto grant_rx_ring_fail; + goto fail; queue->rx_ring_ref = gref; if (feature_split_evtchn) @@ -1693,22 +1699,28 @@ static int setup_netfront(struct xenbus_device *dev, err = setup_netfront_single(queue); if (err) - goto alloc_evtchn_fail; + goto fail; return 0; /* If we 
fail to setup netfront, it is safe to just revoke access to * granted pages because backend is not accessing it at this point. */ -alloc_evtchn_fail: - gnttab_end_foreign_access_ref(queue->rx_ring_ref, 0); -grant_rx_ring_fail: - free_page((unsigned long)rxs); -alloc_rx_ring_fail: - gnttab_end_foreign_access_ref(queue->tx_ring_ref, 0); -grant_tx_ring_fail: - free_page((unsigned long)txs); -fail: + fail: + if (queue->rx_ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(queue->rx_ring_ref, 0, + (unsigned long)rxs); + queue->rx_ring_ref = GRANT_INVALID_REF; + } else { + free_page((unsigned long)rxs); + } + if (queue->tx_ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(queue->tx_ring_ref, 0, + (unsigned long)txs); + queue->tx_ring_ref = GRANT_INVALID_REF; + } else { + free_page((unsigned long)txs); + } return err; } diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c index fd967a38a94a5da4c4687fd9d058120887c3f8d7..ced3c20d645394e3d731ba2ef94ab32eb52b8a3f 100644 --- a/drivers/nfc/st21nfca/se.c +++ b/drivers/nfc/st21nfca/se.c @@ -332,6 +332,11 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, return -ENOMEM; transaction->aid_len = skb->data[1]; + + /* Checking if the length of the AID is valid */ + if (transaction->aid_len > sizeof(transaction->aid)) + return -EINVAL; + memcpy(transaction->aid, &skb->data[2], transaction->aid_len); @@ -341,6 +346,11 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, return -EPROTO; transaction->params_len = skb->data[transaction->aid_len + 3]; + + /* Total size is allocated (skb->len - 2) minus fixed array members */ + if (transaction->params_len > ((skb->len - 2) - sizeof(struct nfc_evt_transaction))) + return -EINVAL; + memcpy(transaction->params, skb->data + transaction->aid_len + 4, transaction->params_len); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index feba1a34c15ed7f8d2d65acb5a9980ca6fda2d13..164ec45dac8444c10d728525d2babff60841287f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3683,7 +3683,14 @@ static void nvme_async_event_work(struct work_struct *work) container_of(work, struct nvme_ctrl, async_event_work); nvme_aen_uevent(ctrl); - ctrl->ops->submit_async_event(ctrl); + + /* + * The transport drivers must guarantee AER submission here is safe by + * flushing ctrl async_event_work after changing the controller state + * from LIVE and before freeing the admin queue. 
+ */ + if (ctrl->state == NVME_CTRL_LIVE) + ctrl->ops->submit_async_event(ctrl); } static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index c49d1fc86cc168257a8d64c5f30d1b6c5702133b..ac552a85a9b6e1027ed4a29ee0331cc5ecf2fb39 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -609,6 +609,8 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id) */ if (ctrl->power_fault_detected) status &= ~PCI_EXP_SLTSTA_PFD; + else if (status & PCI_EXP_SLTSTA_PFD) + ctrl->power_fault_detected = true; events |= status; if (!events) { @@ -618,7 +620,7 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id) } if (status) { - pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, events); + pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, status); /* * In MSI mode, all event bits must be zero before the port @@ -706,8 +708,7 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id) } /* Check Power Fault Detected */ - if ((events & PCI_EXP_SLTSTA_PFD) && !ctrl->power_fault_detected) { - ctrl->power_fault_detected = 1; + if (events & PCI_EXP_SLTSTA_PFD) { ctrl_err(ctrl, "Slot(%s): Power fault\n", slot_name(slot)); pciehp_set_attention_status(slot, 1); pciehp_green_led_off(slot); diff --git a/drivers/scsi/hisi_sas/hisi_sas.h b/drivers/scsi/hisi_sas/hisi_sas.h index 2a70af0191af0b8ebb78eaab7883f22dd2032d61..a791401be7b37add95e58eb513763026ed5fd481 100644 --- a/drivers/scsi/hisi_sas/hisi_sas.h +++ b/drivers/scsi/hisi_sas/hisi_sas.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -87,6 +88,7 @@ #define HISI_SAS_PROT_MASK (HISI_SAS_DIF_PROT_MASK | HISI_SAS_DIX_PROT_MASK) +#define HISI_SAS_WAIT_PHYUP_TIMEOUT (30 * HZ) #define CLEAR_ITCT_TIMEOUT 20 struct hisi_hba; @@ -572,12 +574,6 @@ struct hisi_sas_slot_dif_buf_table { struct hisi_sas_sge_dif_page sge_dif_page; }; -struct hisi_sas_sense_data { - int sense_key; - int add_sense_code; - int add_sense_code_qua; -}; - extern bool hisi_sas_debugfs_enable; extern struct dentry *hisi_sas_debugfs_dir; extern int skip_bus_flag; diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 32f55248c356d4b8777365053ea95974e9a792a4..93df1c6b94c759cb76a547aab58d484f4c6bc471 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -1277,8 +1277,7 @@ static int hisi_sas_exec_internal_tmf_task(struct domain_device *device, /* no error, but return the number of bytes of * underrun */ - dev_warn(dev, "abort tmf: task to dev %016llx " - "resp: 0x%x sts 0x%x underrun\n", + dev_warn(dev, "abort tmf: task to dev %016llx resp: 0x%x sts 0x%x underrun\n", SAS_ADDR(device->sas_addr), task->task_status.resp, task->task_status.stat); @@ -1293,10 +1292,16 @@ static int hisi_sas_exec_internal_tmf_task(struct domain_device *device, break; } - dev_warn(dev, "abort tmf: task to dev " - "%016llx resp: 0x%x status 0x%x\n", - SAS_ADDR(device->sas_addr), task->task_status.resp, - task->task_status.stat); + if (task->task_status.resp == SAS_TASK_COMPLETE && + task->task_status.stat == SAS_OPEN_REJECT) { + dev_warn(dev, "abort tmf: open reject failed\n"); + res = -EIO; + } else { + dev_warn(dev, "abort tmf: task to dev %016llx resp: 0x%x status 0x%x\n", + SAS_ADDR(device->sas_addr), + task->task_status.resp, + task->task_status.stat); + } sas_free_task(task); task = NULL; } @@ -1778,6 +1783,7 @@ static int hisi_sas_debug_I_T_nexus_reset(struct domain_device *device) int rc, reset_type = 
(sas_dev->dev_status == HISI_SAS_DEV_INIT || !dev_is_sata(device)) ? true : false; struct hisi_hba *hisi_hba = dev_to_hisi_hba(device); + struct device *dev = hisi_hba->dev; struct sas_ha_struct *sas_ha = &hisi_hba->sha; DECLARE_COMPLETION_ONSTACK(phyreset); @@ -1804,7 +1810,8 @@ static int hisi_sas_debug_I_T_nexus_reset(struct domain_device *device) struct hisi_sas_phy *phy = container_of(sas_phy, struct hisi_sas_phy, sas_phy); /* Wait for I_T reset complete, time out after 2s */ - int ret = wait_for_completion_timeout(&phyreset, 2 * HZ); + int ret = wait_for_completion_timeout(&phyreset, + HISI_SAS_WAIT_PHYUP_TIMEOUT); unsigned long flags; spin_lock_irqsave(&phy->lock, flags); @@ -1813,8 +1820,10 @@ static int hisi_sas_debug_I_T_nexus_reset(struct domain_device *device) spin_unlock_irqrestore(&phy->lock, flags); /* report PHY down if timed out */ - if (!ret) + if (!ret) { + dev_warn(dev, "phy%d reset timeout\n", sas_phy->id); hisi_sas_phy_down(hisi_hba, sas_phy->id, 0); + } } else if (sas_dev->dev_status != HISI_SAS_DEV_INIT) /* Sleep 2s, wait for I_T reset at expander env except fail */ if (!rc) @@ -1840,7 +1849,8 @@ static int hisi_sas_I_T_nexus_reset(struct domain_device *device) if (dev_is_sata(device)) { rc = hisi_sas_softreset_ata_disk(device); if (rc) - return TMF_RESP_FUNC_FAILED; + dev_err(dev, "I_T nexus reset: softreset failed (%d)\n", + rc); } rc = hisi_sas_debug_I_T_nexus_reset(device); @@ -2124,10 +2134,8 @@ _hisi_sas_internal_task_abort(struct hisi_hba *hisi_hba, } exit: - dev_dbg(dev, "internal task abort: task to dev %016llx task=%pK " - "resp: 0x%x sts 0x%x\n", - SAS_ADDR(device->sas_addr), - task, + dev_dbg(dev, "internal task abort: task to dev %016llx task=%p resp: 0x%x sts 0x%x\n", + SAS_ADDR(device->sas_addr), task, task->task_status.resp, /* 0 is complete, -1 is undelivered */ task->task_status.stat); sas_free_task(task); diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c index 61ff065e5d9b8b13ca84b4814c2af78a0494107e..e6e4897780142e808c50eb6ec904b7f900d329fe 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c @@ -2874,7 +2874,6 @@ static const struct hisi_sas_hw_error port_ecc_axi_error[] = { }, }; -#define WAIT_PHYUP_TIMEOUT_V2_HW 20 static void wait_phyup_timedout_v2_hw(struct timer_list *t) { struct hisi_sas_phy *phy = from_timer(phy, t, timer); @@ -2894,7 +2893,7 @@ static void phy_oob_ready_v2_hw(struct hisi_hba *hisi_hba, int phy_no) if (!timer_pending(&phy->timer)) { dev_dbg(dev, "phy%d OOB ready\n", phy_no); phy->timer.function = wait_phyup_timedout_v2_hw; - phy->timer.expires = jiffies + WAIT_PHYUP_TIMEOUT_V2_HW * HZ; + phy->timer.expires = jiffies + HISI_SAS_WAIT_PHYUP_TIMEOUT; add_timer(&phy->timer); } } diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 3c662fa5f6c0c197fb9d0b39aa925327b75f84e0..baae04adc8e47e05a974a04354941058180140ae 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -1890,7 +1890,6 @@ static void handle_chl_int2_v3_hw(struct hisi_hba *hisi_hba, int phy_no) hisi_sas_phy_write32(hisi_hba, phy_no, CHL_INT2, irq_value); } -#define WAIT_PHYUP_TIMEOUT_V3_HW 20 static void wait_phyup_timedout_v3_hw(struct timer_list *t) { struct hisi_sas_phy *phy = from_timer(phy, t, timer); @@ -1917,7 +1916,7 @@ static void handle_chl_int0_v3_hw(struct hisi_hba *hisi_hba, int phy_no) if (!timer_pending(&phy->timer)) { phy->timer.function = wait_phyup_timedout_v3_hw; 
phy->timer.expires = jiffies + - WAIT_PHYUP_TIMEOUT_V3_HW * HZ; + HISI_SAS_WAIT_PHYUP_TIMEOUT; add_timer(&phy->timer); } } @@ -2291,56 +2290,24 @@ slot_err_v3_hw(struct hisi_hba *hisi_hba, struct sas_task *task, } } -static int hisi_sas_get_sense_data(struct ssp_response_iu *iu, - struct hisi_sas_sense_data *sense_data) -{ - int rc = 0; - - switch (iu->resp_data[0]) { - case 0x70: - case 0x71: { - if (iu->sense_data_len > 13) { - sense_data->sense_key = (iu->resp_data[2] & 0xF); - sense_data->add_sense_code = iu->resp_data[12]; - sense_data->add_sense_code_qua = iu->resp_data[13]; - } else - rc = -1; - break; - } - case 0x72: - case 0x73: { - if (iu->sense_data_len > 4) { - sense_data->sense_key = (iu->resp_data[1] & 0xF); - sense_data->add_sense_code = iu->resp_data[2]; - sense_data->add_sense_code_qua = iu->resp_data[3]; - } else - rc = -1; - break; - } - default: - rc = -2; - break; - } - - return rc; -} - static int ssp_need_spin_up(struct hisi_sas_slot *slot) { - int rc; - struct hisi_sas_sense_data sense_data; + bool rc; + int sb_len; + u8 *sense_buffer; + struct scsi_sense_hdr sshdr; struct ssp_response_iu *iu = hisi_sas_status_buf_addr_mem(slot) + sizeof(struct hisi_sas_err_record); - rc = hisi_sas_get_sense_data(iu, &sense_data); + sb_len = iu->sense_data_len; + sense_buffer = iu->sense_data; + rc = scsi_normalize_sense(sense_buffer, sb_len, &sshdr); /* * if the SAS disk response with ASC=04h, * ASCQ=11h, host should send NOTIFY primitive. */ - if (rc == 0 && - sense_data.add_sense_code == 0x4 && - sense_data.add_sense_code_qua == 0x11) + if (rc && sshdr.asc == 0x4 && sshdr.ascq == 0x11) return 1; return 0; @@ -2444,22 +2411,26 @@ slot_complete_v3_hw(struct hisi_hba *hisi_hba, struct hisi_sas_slot *slot) if ((error_info[3] & RX_DATA_LEN_UNDERFLOW_MSK) && (task->task_proto == SAS_PROTOCOL_SSP)) { /*print detail sense info when data underflow happened*/ - int rc; - struct hisi_sas_sense_data sense_data; + bool rc; + int sb_len; + u8 *sense_buffer; + struct scsi_sense_hdr sshdr; struct ssp_response_iu *iu = hisi_sas_status_buf_addr_mem(slot) + sizeof(struct hisi_sas_err_record); - rc = hisi_sas_get_sense_data(iu, &sense_data); - if (!rc) + sb_len = iu->sense_data_len; + sense_buffer = iu->sense_data; + rc = scsi_normalize_sense(sense_buffer, sb_len, &sshdr); + if (rc) dev_info(dev, "data underflow, rsp_code:0x%x, sensekey:0x%x, ASC:0x%x, ASCQ:0x%x.\n", - iu->resp_data[0], - sense_data.sense_key, - sense_data.add_sense_code, - sense_data.add_sense_code_qua); + sshdr.response_code, + sshdr.sense_key, + sshdr.asc, + sshdr.ascq); else - dev_info(dev, "data underflow without sense, rsp_code:0x%x, rc:%d.\n", - iu->resp_data[0], rc); + dev_info(dev, "data underflow without sense, rsp_code:0x%02x.\n", + iu->resp_data[0]); } if (unlikely(slot->abort)) { sas_task_abort(task); diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 02060aec08bf991b8e97395a1adc460fc298b311..a93a47d4931f66aeddfb6c84d4f0d47ff5d50bb5 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -3133,6 +3133,8 @@ void iscsi_conn_teardown(struct iscsi_cls_conn *cls_conn) { struct iscsi_conn *conn = cls_conn->dd_data; struct iscsi_session *session = conn->session; + char *tmp_persistent_address = conn->persistent_address; + char *tmp_local_ipaddr = conn->local_ipaddr; del_timer_sync(&conn->transport_timer); @@ -3156,8 +3158,6 @@ void iscsi_conn_teardown(struct iscsi_cls_conn *cls_conn) spin_lock_bh(&session->frwd_lock); free_pages((unsigned long) conn->data, 
get_order(ISCSI_DEF_MAX_RECV_SEG_LEN)); - kfree(conn->persistent_address); - kfree(conn->local_ipaddr); /* regular RX path uses back_lock */ spin_lock_bh(&session->back_lock); kfifo_in(&session->cmdpool.queue, (void*)&conn->login_task, @@ -3169,6 +3169,8 @@ void iscsi_conn_teardown(struct iscsi_cls_conn *cls_conn) mutex_unlock(&session->eh_mutex); iscsi_destroy_conn(cls_conn); + kfree(tmp_persistent_address); + kfree(tmp_local_ipaddr); } EXPORT_SYMBOL_GPL(iscsi_conn_teardown); diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index be2daf5536ff7be1172d8162181141d32a7bca01..180087d1c6cdbb9b20558e4fbdd7713e9670d2a2 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -885,7 +885,7 @@ static void get_capabilities(struct scsi_cd *cd) /* allocate transfer buffer */ - buffer = kmalloc(512, GFP_KERNEL | GFP_DMA); + buffer = kmalloc(512, GFP_KERNEL); if (!buffer) { sr_printk(KERN_ERR, cd, "out of memory.\n"); return; diff --git a/drivers/scsi/sr_vendor.c b/drivers/scsi/sr_vendor.c index e3b0ce25162baa10ff1447f5f488376720f4249e..2887be4316be92386e9cc5547d8b73637425c2c2 100644 --- a/drivers/scsi/sr_vendor.c +++ b/drivers/scsi/sr_vendor.c @@ -119,7 +119,7 @@ int sr_set_blocklength(Scsi_CD *cd, int blocklength) density = (blocklength > 2048) ? 0x81 : 0x83; #endif - buffer = kmalloc(512, GFP_KERNEL | GFP_DMA); + buffer = kmalloc(512, GFP_KERNEL); if (!buffer) return -ENOMEM; @@ -167,7 +167,7 @@ int sr_cd_check(struct cdrom_device_info *cdi) if (cd->cdi.mask & CDC_MULTI_SESSION) return 0; - buffer = kmalloc(512, GFP_KERNEL | GFP_DMA); + buffer = kmalloc(512, GFP_KERNEL); if (!buffer) return -ENOMEM; diff --git a/drivers/scsi/ufs/tc-dwc-g210-pci.c b/drivers/scsi/ufs/tc-dwc-g210-pci.c index 2f41722a8c28dc064bd3e5523be420ecf89671db..2c6cb7f6b61a6afd9d10de570bfe3bd21c4ec3a0 100644 --- a/drivers/scsi/ufs/tc-dwc-g210-pci.c +++ b/drivers/scsi/ufs/tc-dwc-g210-pci.c @@ -138,7 +138,6 @@ tc_dwc_g210_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return err; } - pci_set_drvdata(pdev, hba); pm_runtime_put_noidle(&pdev->dev); pm_runtime_allow(&pdev->dev); diff --git a/drivers/scsi/ufs/ufshcd-pltfrm.c b/drivers/scsi/ufs/ufshcd-pltfrm.c index 30c22e16b1e34b49fcf4f2f52f3a2e9854c193ac..57985841a879e3763c0dc6ef2b7dd9937c589cb3 100644 --- a/drivers/scsi/ufs/ufshcd-pltfrm.c +++ b/drivers/scsi/ufs/ufshcd-pltfrm.c @@ -348,8 +348,6 @@ int ufshcd_pltfrm_init(struct platform_device *pdev, goto dealloc_host; } - platform_set_drvdata(pdev, hba); - pm_runtime_set_active(&pdev->dev); pm_runtime_enable(&pdev->dev); diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 3d9bfa52076890abc7c29b818713fa2b9217f07e..2082907bea5008fded1594ecacba8d71a1c19b6e 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -8032,6 +8032,13 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) struct Scsi_Host *host = hba->host; struct device *dev = hba->dev; + /* + * dev_set_drvdata() must be called before any callbacks are registered + * that use dev_get_drvdata() (frequency scaling, clock scaling, hwmon, + * sysfs). 
+ */ + dev_set_drvdata(dev, hba); + if (!mmio_base) { dev_err(hba->dev, "Invalid memory reference for mmio_base is NULL\n"); diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c index 61389bdc7926690100fc0a38fc59e8b6a73853ab..a1d822ae9fcaaedcb3bdd0437568f3382d9532b1 100644 --- a/drivers/scsi/xen-scsifront.c +++ b/drivers/scsi/xen-scsifront.c @@ -233,12 +233,11 @@ static void scsifront_gnttab_done(struct vscsifrnt_info *info, return; for (i = 0; i < shadow->nr_grants; i++) { - if (unlikely(gnttab_query_foreign_access(shadow->gref[i]))) { + if (unlikely(!gnttab_try_end_foreign_access(shadow->gref[i]))) { shost_printk(KERN_ALERT, info->host, KBUILD_MODNAME "grant still in use by backend\n"); BUG(); } - gnttab_end_foreign_access(shadow->gref[i], 0, 0UL); } kfree(shadow->sg); diff --git a/drivers/target/iscsi/iscsi_target_tpg.c b/drivers/target/iscsi/iscsi_target_tpg.c index 101d62105c9325d939ff86aa20091266dea15827..f3671ffdf14953530e39523244f094a5f96b44e8 100644 --- a/drivers/target/iscsi/iscsi_target_tpg.c +++ b/drivers/target/iscsi/iscsi_target_tpg.c @@ -451,6 +451,9 @@ static bool iscsit_tpg_check_network_portal( break; } spin_unlock(&tpg->tpg_np_lock); + + if (match) + break; } spin_unlock(&tiqn->tiqn_tpg_lock); diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 86b7e20ffd7f1e90f1087f9212a85f201c057287..3f08ad7e629ae72f3674286869401f59bcf6aefe 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -313,6 +313,7 @@ static struct tty_driver *gsm_tty_driver; #define GSM1_ESCAPE_BITS 0x20 #define XON 0x11 #define XOFF 0x13 +#define ISO_IEC_646_MASK 0x7F static const struct tty_port_operations gsm_port_ops; @@ -427,7 +428,7 @@ static u8 gsm_encode_modem(const struct gsm_dlci *dlci) modembits |= MDM_RTR; if (dlci->modem_tx & TIOCM_RI) modembits |= MDM_IC; - if (dlci->modem_tx & TIOCM_CD) + if (dlci->modem_tx & TIOCM_CD || dlci->gsm->initiator) modembits |= MDM_DV; return modembits; } @@ -531,7 +532,8 @@ static int gsm_stuff_frame(const u8 *input, u8 *output, int len) int olen = 0; while (len--) { if (*input == GSM1_SOF || *input == GSM1_ESCAPE - || *input == XON || *input == XOFF) { + || (*input & ISO_IEC_646_MASK) == XON + || (*input & ISO_IEC_646_MASK) == XOFF) { *output++ = GSM1_ESCAPE; *output++ = *input++ ^ GSM1_ESCAPE_BITS; olen++; @@ -1484,7 +1486,7 @@ static void gsm_dlci_t1(struct timer_list *t) dlci->mode = DLCI_MODE_ADM; gsm_dlci_open(dlci); } else { - gsm_dlci_close(dlci); + gsm_dlci_begin_close(dlci); /* prevent half open link */ } break; diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 9369f5d2fdf5145972e07990b71560e7da44dd96..a8d7ab7ee057ec43462a7f3551e675d36069a162 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -1375,7 +1375,7 @@ n_tty_receive_char_special(struct tty_struct *tty, unsigned char c) put_tty_queue(c, ldata); smp_store_release(&ldata->canon_head, ldata->read_head); kill_fasync(&tty->fasync, SIGIO, POLL_IN); - wake_up_interruptible_poll(&tty->read_wait, EPOLLIN); + wake_up_interruptible_poll(&tty->read_wait, EPOLLIN | EPOLLRDNORM); return 0; } } @@ -1656,7 +1656,7 @@ static void __receive_buf(struct tty_struct *tty, const unsigned char *cp, if (read_cnt(ldata)) { kill_fasync(&tty->fasync, SIGIO, POLL_IN); - wake_up_interruptible_poll(&tty->read_wait, EPOLLIN); + wake_up_interruptible_poll(&tty->read_wait, EPOLLIN | EPOLLRDNORM); } } diff --git a/drivers/tty/serial/8250/8250_gsc.c b/drivers/tty/serial/8250/8250_gsc.c index 0809ae2aa9b141f4ef91b60518c7a566b09d8409..51cc985216ff3d96703346e440e50ce436b199c4 
100644 --- a/drivers/tty/serial/8250/8250_gsc.c +++ b/drivers/tty/serial/8250/8250_gsc.c @@ -26,7 +26,7 @@ static int __init serial_init_chip(struct parisc_device *dev) unsigned long address; int err; -#ifdef CONFIG_64BIT +#if defined(CONFIG_64BIT) && defined(CONFIG_IOSAPIC) if (!dev->irq && (dev->id.sversion == 0xad)) dev->irq = iosapic_serial_irq(dev); #endif diff --git a/drivers/tty/serial/8250/8250_of.c b/drivers/tty/serial/8250/8250_of.c index 2488de1c4bc4b9ddeaff6f4f572806537b9be795..dcd2105e2b60da20c8420ba13af8c9b55de37047 100644 --- a/drivers/tty/serial/8250/8250_of.c +++ b/drivers/tty/serial/8250/8250_of.c @@ -104,8 +104,17 @@ static int of_platform_serial_setup(struct platform_device *ofdev, port->mapsize = resource_size(&resource); /* Check for shifted address mapping */ - if (of_property_read_u32(np, "reg-offset", &prop) == 0) + if (of_property_read_u32(np, "reg-offset", &prop) == 0) { + if (prop >= port->mapsize) { + dev_warn(&ofdev->dev, "reg-offset %u exceeds region size %pa\n", + prop, &port->mapsize); + ret = -EINVAL; + goto err_unprepare; + } + port->mapbase += prop; + port->mapsize -= prop; + } port->iotype = UPIO_MEM; if (of_property_read_u32(np, "reg-io-width", &prop) == 0) { diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 1867c2546cd977e3345bde434d82c9e3ca5de21f..3a0604b025e94d92520b25e52f838fd675c57480 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -2647,6 +2647,8 @@ static unsigned int serial8250_get_baud_rate(struct uart_port *port, struct ktermios *termios, struct ktermios *old) { + unsigned int tolerance = port->uartclk / 100; + /* * Ask the core to calculate the divisor for us. * Allow 1% tolerance at the upper limit so uart clks marginally @@ -2655,7 +2657,7 @@ static unsigned int serial8250_get_baud_rate(struct uart_port *port, */ return uart_get_baud_rate(port, termios, old, port->uartclk / 16 / UART_DIV_MAX, - port->uartclk); + (port->uartclk + tolerance) / 16); } void diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 1d27e006826d28fd9c535075d5891532a59bd86c..aae97acd183a6b4f3d9b6b099f7bd38730766ead 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -2160,32 +2160,13 @@ static const char *pl011_type(struct uart_port *port) return uap->port.type == PORT_AMBA ? uap->type : NULL; } -/* - * Release the memory region(s) being used by 'port' - */ -static void pl011_release_port(struct uart_port *port) -{ - release_mem_region(port->mapbase, SZ_4K); -} - -/* - * Request the memory region(s) being used by 'port' - */ -static int pl011_request_port(struct uart_port *port) -{ - return request_mem_region(port->mapbase, SZ_4K, "uart-pl011") - != NULL ? 0 : -EBUSY; -} - /* * Configure/autoconfigure the port. 
*/ static void pl011_config_port(struct uart_port *port, int flags) { - if (flags & UART_CONFIG_TYPE) { + if (flags & UART_CONFIG_TYPE) port->type = PORT_AMBA; - pl011_request_port(port); - } } /* @@ -2200,6 +2181,8 @@ static int pl011_verify_port(struct uart_port *port, struct serial_struct *ser) ret = -EINVAL; if (ser->baud_base < 9600) ret = -EINVAL; + if (port->mapbase != (unsigned long) ser->iomem_base) + ret = -EINVAL; return ret; } @@ -2217,8 +2200,6 @@ static const struct uart_ops amba_pl011_pops = { .flush_buffer = pl011_dma_flush_buffer, .set_termios = pl011_set_termios, .type = pl011_type, - .release_port = pl011_release_port, - .request_port = pl011_request_port, .config_port = pl011_config_port, .verify_port = pl011_verify_port, #ifdef CONFIG_CONSOLE_POLL @@ -2248,8 +2229,6 @@ static const struct uart_ops sbsa_uart_pops = { .shutdown = sbsa_uart_shutdown, .set_termios = sbsa_uart_set_termios, .type = pl011_type, - .release_port = pl011_release_port, - .request_port = pl011_request_port, .config_port = pl011_config_port, .verify_port = pl011_verify_port, #ifdef CONFIG_CONSOLE_POLL diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 4e60045487a8604703d87027b24cde5aa96badd3..8d8d63c3ca7d65586842cf6f1d5546b105ca75d3 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -159,7 +159,7 @@ static void uart_port_dtr_rts(struct uart_port *uport, int raise) int RTS_after_send = !!(uport->rs485.flags & SER_RS485_RTS_AFTER_SEND); if (raise) { - if (rs485_on && !RTS_after_send) { + if (rs485_on && RTS_after_send) { uart_set_mctrl(uport, TIOCM_DTR); uart_clear_mctrl(uport, TIOCM_RTS); } else { @@ -168,7 +168,7 @@ static void uart_port_dtr_rts(struct uart_port *uport, int raise) } else { unsigned int clear = TIOCM_DTR; - clear |= (!rs485_on || !RTS_after_send) ? TIOCM_RTS : 0; + clear |= (!rs485_on || RTS_after_send) ? TIOCM_RTS : 0; uart_clear_mctrl(uport, clear); } } diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index e8d7a7bb4339e4ef40fc22cb433588c54dbd43c0..f761e1038c3466eeba7ffd047782070bff3d33bc 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -506,7 +506,7 @@ static void stm32_start_tx(struct uart_port *port) { struct circ_buf *xmit = &port->state->xmit; - if (uart_circ_empty(xmit)) + if (uart_circ_empty(xmit) && !port->x_char) return; stm32_transmit_chars(port); diff --git a/drivers/tty/serial/uartlite.c b/drivers/tty/serial/uartlite.c index 8df30582266877106c9fc61b93c031598214d92c..5d1b7455e627d45dbc3a4029828d25b6253eb7db 100644 --- a/drivers/tty/serial/uartlite.c +++ b/drivers/tty/serial/uartlite.c @@ -618,7 +618,7 @@ static struct uart_driver ulite_uart_driver = { * * Returns: 0 on success, <0 otherwise */ -static int ulite_assign(struct device *dev, int id, u32 base, int irq, +static int ulite_assign(struct device *dev, int id, phys_addr_t base, int irq, struct uartlite_data *pdata) { struct uart_port *port; diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 3025c39ba6b176e9372bc916141ba9b337cbb7f3..708be625860947943638221681ffba8f05c69ba4 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -1343,9 +1343,12 @@ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx) if (!tty->port) tty->port = driver->ports[idx]; - WARN_RATELIMIT(!tty->port, - "%s: %s driver does not set tty->port. This will crash the kernel later. 
Fix the driver!\n", - __func__, tty->driver->name); + if (WARN_RATELIMIT(!tty->port, + "%s: %s driver does not set tty->port. This would crash the kernel. Fix the driver!\n", + __func__, tty->driver->name)) { + retval = -EINVAL; + goto err_release_lock; + } retval = tty_ldisc_lock(tty, 5 * HZ); if (retval) @@ -1509,10 +1512,12 @@ static void release_tty(struct tty_struct *tty, int idx) tty->ops->shutdown(tty); tty_save_termios(tty); tty_driver_remove_tty(tty->driver, tty); - tty->port->itty = NULL; + if (tty->port) + tty->port->itty = NULL; if (tty->link) tty->link->port->itty = NULL; - tty_buffer_cancel_work(tty->port); + if (tty->port) + tty_buffer_cancel_work(tty->port); if (tty->link) tty_buffer_cancel_work(tty->link->port); diff --git a/drivers/usb/gadget/udc/udc-xilinx.c b/drivers/usb/gadget/udc/udc-xilinx.c index 6407e433bc78de40d3292a0461a93136d4b63e0c..72f1bc6a680e37d7f29431325550f1c3fa29e7ff 100644 --- a/drivers/usb/gadget/udc/udc-xilinx.c +++ b/drivers/usb/gadget/udc/udc-xilinx.c @@ -1613,6 +1613,8 @@ static void xudc_getstatus(struct xusb_udc *udc) break; case USB_RECIP_ENDPOINT: epnum = udc->setup.wIndex & USB_ENDPOINT_NUMBER_MASK; + if (epnum >= XUSB_MAX_ENDPOINTS) + goto stall; target_ep = &udc->ep[epnum]; epcfgreg = udc->read_fn(udc->addr + target_ep->offset); halt = epcfgreg & XUSB_EP_CFG_STALL_MASK; @@ -1680,6 +1682,10 @@ static void xudc_set_clear_feature(struct xusb_udc *udc) case USB_RECIP_ENDPOINT: if (!udc->setup.wValue) { endpoint = udc->setup.wIndex & USB_ENDPOINT_NUMBER_MASK; + if (endpoint >= XUSB_MAX_ENDPOINTS) { + xudc_ep0_stall(udc); + return; + } target_ep = &udc->ep[endpoint]; outinbit = udc->setup.wIndex & USB_ENDPOINT_DIR_MASK; outinbit = outinbit >> 7; diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 3fa40c723e8e95cb85d8c4ee5449ac716cb43e63..edb0acd0b8323cad2165d23fb4764358af8a45b8 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -169,20 +169,14 @@ static int add_grefs(struct ioctl_gntalloc_alloc_gref *op, __del_gref(gref); } - /* It's possible for the target domain to map the just-allocated grant - * references by blindly guessing their IDs; if this is done, then - * __del_gref will leave them in the queue_gref list. They need to be - * added to the global list so that we can free them when they are no - * longer referenced. 
- */ - if (unlikely(!list_empty(&queue_gref))) - list_splice_tail(&queue_gref, &gref_list); mutex_unlock(&gref_mutex); return rc; } static void __del_gref(struct gntalloc_gref *gref) { + unsigned long addr; + if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { uint8_t *tmp = kmap(gref->page); tmp[gref->notify.pgoff] = 0; @@ -196,21 +190,16 @@ static void __del_gref(struct gntalloc_gref *gref) gref->notify.flags = 0; if (gref->gref_id) { - if (gnttab_query_foreign_access(gref->gref_id)) - return; - - if (!gnttab_end_foreign_access_ref(gref->gref_id, 0)) - return; - - gnttab_free_grant_reference(gref->gref_id); + if (gref->page) { + addr = (unsigned long)page_to_virt(gref->page); + gnttab_end_foreign_access(gref->gref_id, 0, addr); + } else + gnttab_free_grant_reference(gref->gref_id); } gref_size--; list_del(&gref->next_gref); - if (gref->page) - __free_page(gref->page); - kfree(gref); } diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 97341fa75458ee7d563bb3643ea71509f183ec69..e434a583ee7e2a9c77de6ffc90c3ecddd7c3f748 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -135,12 +135,9 @@ struct gnttab_ops { */ unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref); /* - * Query the status of a grant entry. Ref parameter is reference of - * queried grant entry, return value is the status of queried entry. - * Detailed status(writing/reading) can be gotten from the return value - * by bit operations. + * Read the frame number related to a given grant reference. */ - int (*query_foreign_access)(grant_ref_t ref); + unsigned long (*read_frame)(grant_ref_t ref); }; struct unmap_refs_callback_data { @@ -285,22 +282,6 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); -static int gnttab_query_foreign_access_v1(grant_ref_t ref) -{ - return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing); -} - -static int gnttab_query_foreign_access_v2(grant_ref_t ref) -{ - return grstatus[ref] & (GTF_reading|GTF_writing); -} - -int gnttab_query_foreign_access(grant_ref_t ref) -{ - return gnttab_interface->query_foreign_access(ref); -} -EXPORT_SYMBOL_GPL(gnttab_query_foreign_access); - static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly) { u16 flags, nflags; @@ -354,6 +335,16 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) } EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); +static unsigned long gnttab_read_frame_v1(grant_ref_t ref) +{ + return gnttab_shared.v1[ref].frame; +} + +static unsigned long gnttab_read_frame_v2(grant_ref_t ref) +{ + return gnttab_shared.v2[ref].full_page.frame; +} + struct deferred_entry { struct list_head list; grant_ref_t ref; @@ -383,12 +374,9 @@ static void gnttab_handle_deferred(struct timer_list *unused) spin_unlock_irqrestore(&gnttab_list_lock, flags); if (_gnttab_end_foreign_access_ref(entry->ref, entry->ro)) { put_free_entry(entry->ref); - if (entry->page) { - pr_debug("freeing g.e. %#x (pfn %#lx)\n", - entry->ref, page_to_pfn(entry->page)); - put_page(entry->page); - } else - pr_info("freeing g.e. %#x\n", entry->ref); + pr_debug("freeing g.e. 
%#x (pfn %#lx)\n", + entry->ref, page_to_pfn(entry->page)); + put_page(entry->page); kfree(entry); entry = NULL; } else { @@ -413,9 +401,18 @@ static void gnttab_handle_deferred(struct timer_list *unused) static void gnttab_add_deferred(grant_ref_t ref, bool readonly, struct page *page) { - struct deferred_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + struct deferred_entry *entry; + gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL; const char *what = KERN_WARNING "leaking"; + entry = kmalloc(sizeof(*entry), gfp); + if (!page) { + unsigned long gfn = gnttab_interface->read_frame(ref); + + page = pfn_to_page(gfn_to_pfn(gfn)); + get_page(page); + } + if (entry) { unsigned long flags; @@ -436,11 +433,21 @@ static void gnttab_add_deferred(grant_ref_t ref, bool readonly, what, ref, page ? page_to_pfn(page) : -1); } +int gnttab_try_end_foreign_access(grant_ref_t ref) +{ + int ret = _gnttab_end_foreign_access_ref(ref, 0); + + if (ret) + put_free_entry(ref); + + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_try_end_foreign_access); + void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page) { - if (gnttab_end_foreign_access_ref(ref, readonly)) { - put_free_entry(ref); + if (gnttab_try_end_foreign_access(ref)) { if (page != 0) put_page(virt_to_page(page)); } else @@ -1297,7 +1304,7 @@ static const struct gnttab_ops gnttab_v1_ops = { .update_entry = gnttab_update_entry_v1, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1, - .query_foreign_access = gnttab_query_foreign_access_v1, + .read_frame = gnttab_read_frame_v1, }; static const struct gnttab_ops gnttab_v2_ops = { @@ -1309,7 +1316,7 @@ static const struct gnttab_ops gnttab_v2_ops = { .update_entry = gnttab_update_entry_v2, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2, - .query_foreign_access = gnttab_query_foreign_access_v2, + .read_frame = gnttab_read_frame_v2, }; static bool gnttab_need_v2(void) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index d7438fdc57061ef9b1479e4e39fc88540e1f5cd3..285bb8390a97bab0c1cc940d9ce033e15b600d4e 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -346,8 +346,8 @@ static void free_active_ring(struct sock_mapping *map) if (!map->active.ring) return; - free_pages((unsigned long)map->active.data.in, - map->active.ring->ring_order); + free_pages_exact(map->active.data.in, + PAGE_SIZE << map->active.ring->ring_order); free_page((unsigned long)map->active.ring); } @@ -361,8 +361,8 @@ static int alloc_active_ring(struct sock_mapping *map) goto out; map->active.ring->ring_order = PVCALLS_RING_ORDER; - bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - PVCALLS_RING_ORDER); + bytes = alloc_pages_exact(PAGE_SIZE << PVCALLS_RING_ORDER, + GFP_KERNEL | __GFP_ZERO); if (!bytes) goto out; diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 6f6f6db948385f806497895c5cf5f007a7934f89..365cf9f5c9671c0333b7e079e1378b4116e80e38 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -368,27 +368,31 @@ int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr, unsigned int nr_pages, grant_ref_t *grefs) { int err; - int i, j; + unsigned int i; + grant_ref_t gref_head; + + err = gnttab_alloc_grant_references(nr_pages, &gref_head); + if (err) { + xenbus_dev_fatal(dev, err, "granting access to ring page"); + 
return err; + } for (i = 0; i < nr_pages; i++) { - err = gnttab_grant_foreign_access(dev->otherend_id, - virt_to_gfn(vaddr), 0); - if (err < 0) { - xenbus_dev_fatal(dev, err, - "granting access to ring page"); - goto fail; - } - grefs[i] = err; + unsigned long gfn; + + if (is_vmalloc_addr(vaddr)) + gfn = pfn_to_gfn(vmalloc_to_pfn(vaddr)); + else + gfn = virt_to_gfn(vaddr); + + grefs[i] = gnttab_claim_grant_reference(&gref_head); + gnttab_grant_foreign_access_ref(grefs[i], dev->otherend_id, + gfn, 0); vaddr = vaddr + XEN_PAGE_SIZE; } return 0; - -fail: - for (j = 0; j < i; j++) - gnttab_end_foreign_access_ref(grefs[j], 0); - return err; } EXPORT_SYMBOL_GPL(xenbus_grant_ring); diff --git a/fs/block_dev.c b/fs/block_dev.c index 8b299347f2aa981f92b3cbf546cc4e9fe01942b7..9868b21b8ef945868fb6f91f14c1b55b62f7d1a7 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -253,6 +253,9 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, task_io_account_write(ret); } + if (iocb->ki_flags & IOCB_NOWAIT) + bio.bi_opf |= REQ_NOWAIT; + qc = submit_bio(&bio); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -407,6 +410,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) bio->bi_opf = dio_bio_write_op(iocb); task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_NOWAIT) + bio->bi_opf |= REQ_NOWAIT; dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index bc906fcf3f6dbca0c2ec2c1c3ccab9282925d1e2..baa1713d66958659b7134139adb7028448d1e794 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -779,6 +779,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, out_super: deactivate_locked_super(sb); + return root; out: cifs_cleanup_volume_info(volume_info); return root; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6787d12d5abf3fbb83b51850baed0ac4ecb19da7..16e2b719d47c2d22d76d905893efc8995e129b1d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1985,6 +1985,10 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) * Structure of a directory entry */ #define EXT4_NAME_LEN 255 +/* + * Base length of the ext4 directory entry excluding the name length + */ +#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN) struct ext4_dir_entry { __le32 inode; /* Inode number */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index f36936bcd26078fb3d292d255606b128501a6386..b5d172bf33187d672c9fbfb0750e12b220daa93d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1126,7 +1126,15 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc, void *buf, int inline_size) { - ext4_create_inline_data(handle, inode, inline_size); + int ret; + + ret = ext4_create_inline_data(handle, inode, inline_size); + if (ret) { + ext4_msg(inode->i_sb, KERN_EMERG, + "error restoring inline_data for inode -- potential data loss! 
(inode %lu, error %d)", + inode->i_ino, ret); + return; + } ext4_write_inline_data(inode, iloc, buf, 0, inline_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } @@ -1767,19 +1775,20 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) void *inline_pos; unsigned int offset; struct ext4_dir_entry_2 *de; - bool ret = true; + bool ret = false; err = ext4_get_inode_loc(dir, &iloc); if (err) { EXT4_ERROR_INODE_ERR(dir, -err, "error %d getting inode %lu block", err, dir->i_ino); - return true; + return false; } down_read(&EXT4_I(dir)->xattr_sem); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; + ret = true; goto out; } @@ -1788,7 +1797,6 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - no `..'", dir->i_ino); - ret = true; goto out; } @@ -1807,16 +1815,15 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) dir->i_ino, le32_to_cpu(de->inode), le16_to_cpu(de->rec_len), de->name_len, inline_size); - ret = true; goto out; } if (le32_to_cpu(de->inode)) { - ret = false; goto out; } offset += ext4_rec_len_from_disk(de->rec_len, inline_size); } + ret = true; out: up_read(&EXT4_I(dir)->xattr_sem); brelse(iloc.bh); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c6cac2d40f710b70fad419a49065b5b67f21f3a1..3c8324715e6568115c037b0e63431cf125954321 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1056,8 +1056,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) sizeof(range))) return -EFAULT; - range.minlen = max((unsigned int)range.minlen, - q->limits.discard_granularity); ret = ext4_trim_fs(sb, &range); if (ret < 0) return ret; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 69ed137d07a9777c76fa7afdeae20db405c51396..990fe7eed6bf31c16f22d2d7a9abd5642db19b04 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5262,6 +5262,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); struct ext4_group_info *grp; ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; @@ -5280,6 +5281,13 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) start >= max_blks || range->len < sb->s_blocksize) return -EINVAL; + /* No point to try to trim less than discard granularity */ + if (range->minlen < q->limits.discard_granularity) { + minlen = EXT4_NUM_B2C(EXT4_SB(sb), + q->limits.discard_granularity >> sb->s_blocksize_bits); + if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) + goto out; + } if (end >= max_blks) end = max_blks - 1; if (end <= first_data_blk) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 7eb036cd1d93b0426e61bbaa617ad033c9c988ac..75a769634b2b32337ff7041aa830295a6a2e81a0 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -435,12 +435,12 @@ int ext4_ext_migrate(struct inode *inode) percpu_down_write(&sbi->s_writepages_rwsem); /* - * Worst case we can touch the allocation bitmaps, a bgd - * block, and a block to link in the orphan list. We do need - * need to worry about credits for modifying the quota inode. + * Worst case we can touch the allocation bitmaps and a block + * group descriptor block. We do need need to worry about + * credits for modifying the quota inode. 
*/ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, - 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); + 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); @@ -457,6 +457,13 @@ int ext4_ext_migrate(struct inode *inode) ext4_journal_stop(handle); goto out_unlock; } + /* + * Use the correct seed for checksum (i.e. the seed from 'inode'). This + * is so that the metadata blocks will have the correct checksum after + * the migration. + */ + ei = EXT4_I(inode); + EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed; i_size_write(tmp_inode, i_size_read(inode)); /* * Set the i_nlink to zero so it will be deleted later @@ -465,7 +472,6 @@ int ext4_ext_migrate(struct inode *inode) clear_nlink(tmp_inode); ext4_ext_tree_init(handle, tmp_inode); - ext4_orphan_add(handle, tmp_inode); ext4_journal_stop(handle); /* @@ -490,17 +496,10 @@ int ext4_ext_migrate(struct inode *inode) handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { - /* - * It is impossible to update on-disk structures without - * a handle, so just rollback in-core changes and live other - * work to orphan_list_cleanup() - */ - ext4_orphan_del(NULL, tmp_inode); retval = PTR_ERR(handle); goto out_tmp_inode; } - ei = EXT4_I(inode); i_data = ei->i_data; memset(&lb, 0, sizeof(lb)); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 84f68e15e853bb1a9b6202413d4767c3da151ebc..e3e57eab371f13f94d2e5f97ad6f92a7442063bd 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1304,10 +1304,10 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; - while ((char *) de < dlimit) { + while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ - if ((char *) de + de->name_len <= dlimit && + if (de->name + de->name_len <= dlimit && ext4_match(fname, de)) { /* found a match - just to be sure, do * a full check */ @@ -2743,7 +2743,7 @@ bool ext4_empty_dir(struct inode *inode) sb = inode->i_sb; if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { EXT4_ERROR_INODE(inode, "invalid size"); - return true; + return false; } /* The first directory block must not be a hole, * so treat it as DIRENT_HTREE @@ -2758,7 +2758,7 @@ bool ext4_empty_dir(struct inode *inode) le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { ext4_warning_inode(inode, "directory missing '.'"); brelse(bh); - return true; + return false; } offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); de = ext4_next_entry(de, sb->s_blocksize); @@ -2767,7 +2767,7 @@ bool ext4_empty_dir(struct inode *inode) le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { ext4_warning_inode(inode, "directory missing '..'"); brelse(bh); - return true; + return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); while (offset < inode->i_size) { diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 9cc79b7b0df110bb0bac5b5b91fdc6e1109ff398..3de933354a08b1039cc1021304f0bd5bfe129965 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -105,8 +105,10 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (bio->bi_status) + if (bio->bi_status) { + set_buffer_write_io_error(bh); buffer_io_error(bh); + } } while ((bh = bh->b_this_page) != head); bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); local_irq_restore(flags); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 
08d67f79aed7bf5f674b63a1164835c040a55c4f..7441e28d270605be0257cefbc03b2340446e7be2 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -74,6 +74,11 @@ int ext4_resize_begin(struct super_block *sb) return -EPERM; } + if (ext4_has_feature_sparse_super2(sb)) { + ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2"); + return -EOPNOTSUPP; + } + if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags)) ret = -EBUSY; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8f9d60ec607ea364cd75ad79c2c14b5874133ad7..4d9de6216fe4b7807d0c1b2cbcf4679cac17218d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1154,6 +1154,18 @@ static void ext4_put_super(struct super_block *sb) int aborted = 0; int i, err; + /* + * Unregister sysfs before destroying jbd2 journal. + * Since we could still access attr_journal_task attribute via sysfs + * path which could have sbi->s_journal->j_task as NULL + * Unregister sysfs before flush sbi->s_error_work. + * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If + * read metadata verify failed then will queue error work. + * flush_stashed_error_work will call start_this_handle may trigger + * BUG_ON. + */ + ext4_unregister_sysfs(sb); + ext4_unregister_li_request(sb); ext4_quota_off_umount(sb); @@ -1169,7 +1181,6 @@ static void ext4_put_super(struct super_block *sb) } } - ext4_unregister_sysfs(sb); ext4_es_unregister_shrinker(sbi); del_timer_sync(&sbi->s_err_report); ext4_release_system_zone(sb); @@ -6080,10 +6091,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); - if (err) { - lockdep_set_quota_inode(path->dentry->d_inode, - I_DATA_SEM_NORMAL); - } else { + if (!err) { struct inode *inode = d_inode(path->dentry); handle_t *handle; @@ -6103,7 +6111,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, ext4_journal_stop(handle); unlock_inode: inode_unlock(inode); + if (err) + dquot_quota_off(sb, type); } + if (err) + lockdep_set_quota_inode(path->dentry->d_inode, + I_DATA_SEM_NORMAL); return err; } @@ -6166,8 +6179,19 @@ static int ext4_enable_quotas(struct super_block *sb) "Failed to enable quota tracking " "(type=%d, err=%d). Please run " "e2fsck to fix.", type, err); - for (type--; type >= 0; type--) + for (type--; type >= 0; type--) { + struct inode *inode; + + inode = sb_dqopt(sb)->files[type]; + if (inode) + inode = igrab(inode); dquot_quota_off(sb, type); + if (inode) { + lockdep_set_quota_inode(inode, + I_DATA_SEM_NORMAL); + iput(inode); + } + } return err; } diff --git a/fs/file.c b/fs/file.c index 20a44d0ac400b2f8e92ed88a64fada9823aab4f8..eb10b7634d76e09b40253651e4ee8c56d30108c6 100644 --- a/fs/file.c +++ b/fs/file.c @@ -762,28 +762,69 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } -static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs) +static inline struct file *__fget_files_rcu(struct files_struct *files, + unsigned int fd, fmode_t mask, unsigned int refs) { - struct files_struct *files = current->files; - struct file *file; + for (;;) { + struct file *file; + struct fdtable *fdt = rcu_dereference_raw(files->fdt); + struct file __rcu **fdentry; - rcu_read_lock(); -loop: - file = fcheck_files(files, fd); - if (file) { - /* File object ref couldn't be taken. 
- * dup2() atomicity guarantee is the reason - * we loop to catch the new file (or NULL pointer) + if (unlikely(fd >= fdt->max_fds)) + return NULL; + + fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); + file = rcu_dereference_raw(*fdentry); + if (unlikely(!file)) + return NULL; + + if (unlikely(file->f_mode & mask)) + return NULL; + + /* + * Ok, we have a file pointer. However, because we do + * this all locklessly under RCU, we may be racing with + * that file being closed. + * + * Such a race can take two forms: + * + * (a) the file ref already went down to zero, + * and get_file_rcu_many() fails. Just try + * again: + */ + if (unlikely(!get_file_rcu_many(file, refs))) + continue; + + /* + * (b) the file table entry has changed under us. + * Note that we don't need to re-check the 'fdt->fd' + * pointer having changed, because it always goes + * hand-in-hand with 'fdt'. + * + * If so, we need to put our refs and try again. */ - if (file->f_mode & mask) - file = NULL; - else if (!get_file_rcu_many(file, refs)) - goto loop; - else if (__fcheck_files(files, fd) != file) { + if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || + unlikely(rcu_dereference_raw(*fdentry) != file)) { fput_many(file, refs); - goto loop; + continue; } + + /* + * Ok, we have a ref to the file, and checked that it + * still exists. + */ + return file; } +} + + +static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs) +{ + struct files_struct *files = current->files; + struct file *file; + + rcu_read_lock(); + file = __fget_files_rcu(files, fd, mask, refs); rcu_read_unlock(); return file; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8a98ffd5e4b67500d76e94fde34e790e5cac6408..bd5293e01e48695fb52bf9998b012f7f58b1c833 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -999,7 +999,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, while (count) { if (cs->write && cs->pipebufs && page) { - return fuse_ref_page(cs, page, offset, count); + /* + * Can't control lifetime of pipe buffers, so always + * copy user pages. 
+ */ + if (cs->req->in.user_pages) { + err = fuse_copy_fill(cs); + if (err) + return err; + } else { + return fuse_ref_page(cs, page, offset, count); + } } else if (!cs->len) { if (cs->move_pages && page && offset == 0 && count == PAGE_SIZE) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5ecabacabd47757203f8f01d17b6623abfeb15d1..8d238d965db17b6da66bbf5824e0625f0399d515 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1321,10 +1321,12 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } - if (write) + if (write) { + req->in.user_pages = 1; req->in.argpages = 1; - else + } else { req->out.argpages = 1; + } *nbytesp = nbytes; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 853d37ec81e0c1ccefd9b86223d0eec1bbd09eeb..0981011c10c6e2f5c3ada577ebdaafc0f6c98d8f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -174,6 +174,11 @@ struct fuse_in { /** True if the data for the last argument is in req->pages */ unsigned argpages:1; +#ifndef __GENKSYMS__ + /** True if direct write */ + unsigned user_pages:1; +#endif + /** Number of arguments */ unsigned numargs; diff --git a/fs/io-wq.c b/fs/io-wq.c index 9c223dfb4b3f79a35d314c5cc9125e73702aaaae..b6e311e57b47cd7742d9fc8b5b47e4e3f2be3807 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -282,7 +282,8 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) * Most likely an attempt to queue unbounded work on an io_wq that * wasn't setup with any unbounded workers. */ - WARN_ON_ONCE(!acct->max_workers); + if (unlikely(!acct->max_workers)) + pr_warn_once("io-wq is not configured for unbound workers"); rcu_read_lock(); ret = io_wqe_activate_free_worker(wqe); @@ -1050,6 +1051,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) if (WARN_ON_ONCE(!data->free_work || !data->do_work)) return ERR_PTR(-EINVAL); + if (WARN_ON_ONCE(!bounded)) + return ERR_PTR(-EINVAL); wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) diff --git a/fs/io_uring.c b/fs/io_uring.c index f933d4f0edb4c11d93a23ac02e269f47d6999593..c104425b2557510f49f8d647f81d94fadf19e9da 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -917,6 +917,13 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_comp_state *cs); +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock); +static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter); + static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -1325,6 +1332,7 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) static void io_flush_timeouts(struct io_ring_ctx *ctx) { + struct io_kiocb *req, *tmp; u32 seq; if (list_empty(&ctx->timeout_list)) @@ -1332,10 +1340,8 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - do { + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { u32 events_needed, events_got; - struct io_kiocb *req = list_first_entry(&ctx->timeout_list, - struct io_kiocb, timeout.list); if (io_is_timeout_noseq(req)) break; @@ -1352,9 +1358,8 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) if (events_got < events_needed) break; - list_del_init(&req->timeout.list); io_kill_timeout(req); - } while (!list_empty(&ctx->timeout_list)); + } ctx->cq_last_tm_flush = seq; } @@ -2363,10 +2368,92 @@ static void 
io_complete_rw_common(struct kiocb *kiocb, long res, __io_req_complete(req, res, cflags, cs); } +#ifdef CONFIG_BLOCK +static bool io_resubmit_prep(struct io_kiocb *req, int error) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + ssize_t ret; + struct iov_iter iter; + int rw; + + if (error) { + ret = error; + goto end_req; + } + + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + rw = READ; + break; + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + rw = WRITE; + break; + default: + printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n", + req->opcode); + goto end_req; + } + + if (!req->io) { + ret = io_import_iovec(rw, req, &iovec, &iter, false); + if (ret < 0) + goto end_req; + ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter); + if (!ret) + return true; + kfree(iovec); + } else { + return true; + } +end_req: + req_set_fail_links(req); + return false; +} + +static void io_rw_resubmit(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; + int err; + + __set_current_state(TASK_RUNNING); + + err = io_sq_thread_acquire_mm_files(ctx, req); + + if (io_resubmit_prep(req, err)) { + refcount_inc(&req->refs); + io_queue_async_work(req); + } +} +#endif + +static bool io_rw_reissue(struct io_kiocb *req, long res) +{ +#ifdef CONFIG_BLOCK + struct task_struct *tsk; + int ret; + + if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) + return false; + + tsk = req->task; + init_task_work(&req->task_work, io_rw_resubmit); + ret = task_work_add(tsk, &req->task_work, true); + if (!ret) + return true; +#endif + return false; +} + static void __io_complete_rw(struct io_kiocb *req, long res, long res2, struct io_comp_state *cs) { - io_complete_rw_common(&req->rw.kiocb, res, cs); + if (!io_rw_reissue(req, res)) + io_complete_rw_common(&req->rw.kiocb, res, cs); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) @@ -2536,6 +2623,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (kiocb->ki_flags & IOCB_NOWAIT) req->flags |= REQ_F_NOWAIT; + if (kiocb->ki_flags & IOCB_DIRECT) + io_get_req_task(req); + if (force_nonblock) kiocb->ki_flags |= IOCB_NOWAIT; @@ -3037,6 +3127,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); if (!ret) { + unsigned long nr_segs = iter.nr_segs; ssize_t ret2; if (req->file->f_op->read_iter) @@ -3054,6 +3145,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, goto copy_iov; kiocb_done(kiocb, ret2, cs); } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); @@ -3120,6 +3213,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); if (!ret) { + unsigned long nr_segs = iter.nr_segs; ssize_t ret2; /* @@ -3157,6 +3251,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, goto copy_iov; kiocb_done(kiocb, ret2, cs); } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); @@ -5115,6 +5211,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, else data->mode = HRTIMER_MODE_REL; + INIT_LIST_HEAD(&req->timeout.list); hrtimer_init(&data->timer, 
CLOCK_MONOTONIC, data->mode); return 0; } @@ -5515,6 +5612,7 @@ static void __io_clean_op(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); req->flags &= ~REQ_F_INFLIGHT; + put_files_struct(req->work.files); } } @@ -5892,7 +5990,7 @@ static int io_grab_files(struct io_kiocb *req) if (fcheck(ctx->ring_fd) == ctx->ring_file) { list_add(&req->inflight_entry, &ctx->inflight_list); req->flags |= REQ_F_INFLIGHT; - req->work.files = current->files; + req->work.files = get_files_struct(current); ret = 0; } spin_unlock_irq(&ctx->inflight_lock); @@ -5933,6 +6031,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) prev = NULL; } + list_del(&req->timeout.list); spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 7d8654a1472ef8802b4b4f3769b818d4a2843c2f..3047872fdac9bca6ac3ae2370bbb0b9ada7f12ce 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -135,20 +135,15 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct page *pg; struct inode *inode = mapping->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); pgoff_t index = pos >> PAGE_SHIFT; uint32_t pageofs = index << PAGE_SHIFT; int ret = 0; - pg = grab_cache_page_write_begin(mapping, index, flags); - if (!pg) - return -ENOMEM; - *pagep = pg; - jffs2_dbg(1, "%s()\n", __func__); if (pageofs > inode->i_size) { /* Make new hole frag from old EOF to new page */ - struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_raw_inode ri; struct jffs2_full_dnode *fn; uint32_t alloc_len; @@ -159,7 +154,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); if (ret) - goto out_page; + goto out_err; mutex_lock(&f->sem); memset(&ri, 0, sizeof(ri)); @@ -189,7 +184,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, ret = PTR_ERR(fn); jffs2_complete_reservation(c); mutex_unlock(&f->sem); - goto out_page; + goto out_err; } ret = jffs2_add_full_dnode_to_inode(c, f, fn); if (f->metadata) { @@ -204,13 +199,26 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, jffs2_free_full_dnode(fn); jffs2_complete_reservation(c); mutex_unlock(&f->sem); - goto out_page; + goto out_err; } jffs2_complete_reservation(c); inode->i_size = pageofs; mutex_unlock(&f->sem); } + /* + * While getting a page and reading data in, lock c->alloc_sem until + * the page is Uptodate. Otherwise GC task may attempt to read the same + * page in read_cache_page(), which causes a deadlock. + */ + mutex_lock(&c->alloc_sem); + pg = grab_cache_page_write_begin(mapping, index, flags); + if (!pg) { + ret = -ENOMEM; + goto release_sem; + } + *pagep = pg; + /* * Read in the page if it wasn't already present. Cannot optimize away * the whole page write case until jffs2_write_end can handle the @@ -220,15 +228,17 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, mutex_lock(&f->sem); ret = jffs2_do_readpage_nolock(inode, pg); mutex_unlock(&f->sem); - if (ret) - goto out_page; + if (ret) { + unlock_page(pg); + put_page(pg); + goto release_sem; + } } jffs2_dbg(1, "end write_begin(). 
pg->flags %lx\n", pg->flags); - return ret; -out_page: - unlock_page(pg); - put_page(pg); +release_sem: + mutex_unlock(&c->alloc_sem); +out_err: return ret; } diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 8f34daf85f7036a1573f78d8e95f27347cd7c639..5d5227ce4d91e8e48c61c4cc4b0f5c84baa44df1 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -168,7 +168,7 @@ struct cb_devicenotifyitem { }; struct cb_devicenotifyargs { - int ndevs; + uint32_t ndevs; struct cb_devicenotifyitem *devs; }; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index bcc51f131a49614c2590fea6bdc0faa3c09b11b7..868d66ed8bcf6e01a6d9aeb2e409d2f273c6acfc 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -364,7 +364,7 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp, struct cb_process_state *cps) { struct cb_devicenotifyargs *args = argp; - int i; + uint32_t i; __be32 res = 0; struct nfs_client *clp = cps->clp; struct nfs_server *server = NULL; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 57558a8d92e9ba4ae3214365d4314b06ebb6499b..76aa1b456c5243ef2510d062d0c419002b37c116 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -268,11 +268,9 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, void *argp) { struct cb_devicenotifyargs *args = argp; + uint32_t tmp, n, i; __be32 *p; __be32 status = 0; - u32 tmp; - int n, i; - args->ndevs = 0; /* Num of device notifications */ p = read_buf(xdr, sizeof(uint32_t)); @@ -281,7 +279,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, goto out; } n = ntohl(*p++); - if (n <= 0) + if (n == 0) goto out; if (n > ULONG_MAX / sizeof(*args->devs)) { status = htonl(NFS4ERR_BADXDR); @@ -339,19 +337,21 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, dev->cbd_immediate = 0; } - args->ndevs++; - dprintk("%s: type %d layout 0x%x immediate %d\n", __func__, dev->cbd_notify_type, dev->cbd_layout_type, dev->cbd_immediate); } + args->ndevs = n; + dprintk("%s: ndevs %d\n", __func__, args->ndevs); + return 0; +err: + kfree(args->devs); out: + args->devs = NULL; + args->ndevs = 0; dprintk("%s: status %d ndevs %d\n", __func__, ntohl(status), args->ndevs); return status; -err: - kfree(args->devs); - goto out; } static __be32 decode_sessionid(struct xdr_stream *xdr, diff --git a/fs/nfs/client.c b/fs/nfs/client.c index b8adf3467786e13c27ea7f4738eee7d954d17966..7d02dc52209da626f3a46b210d2f39b0ce774658 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -180,6 +180,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) INIT_LIST_HEAD(&clp->cl_superblocks); clp->cl_rpcclient = ERR_PTR(-EINVAL); + clp->cl_flags = cl_init->init_flags; clp->cl_proto = cl_init->proto; clp->cl_net = get_net(cl_init->net); @@ -427,7 +428,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) list_add_tail(&new->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); - new->cl_flags = cl_init->init_flags; return rpc_ops->init_client(new, cl_init); } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 67ece39c7800e80810a1629fe0c8fe4c7d4aa4be..2ec602eccb80a0bc9c452ae7ae08a70b90e13ebd 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1640,14 +1640,14 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, if (!res) { inode = d_inode(dentry); if ((lookup_flags & LOOKUP_DIRECTORY) && inode && - !S_ISDIR(inode->i_mode)) + !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) res = ERR_PTR(-ENOTDIR); else if (inode && S_ISREG(inode->i_mode)) res = 
ERR_PTR(-EOPENSTALE); } else if (!IS_ERR(res)) { inode = d_inode(res); if ((lookup_flags & LOOKUP_DIRECTORY) && inode && - !S_ISDIR(inode->i_mode)) { + !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) { dput(res); res = ERR_PTR(-ENOTDIR); } else if (inode && S_ISREG(inode->i_mode)) { @@ -2052,6 +2052,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) trace_nfs_link_enter(inode, dir, dentry); d_drop(dentry); + if (S_ISREG(inode->i_mode)) + nfs_sync_inode(inode); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { ihold(inode); @@ -2140,6 +2142,8 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } + if (S_ISREG(old_inode->i_mode)) + nfs_sync_inode(old_inode); task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); if (IS_ERR(task)) { error = PTR_ERR(task); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6d5fedcd30d05d9e29967b83d0f273cc4b1e4599..db52430dfbd7e780ed25bc79a7a8b4a0d50b12c5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -787,12 +787,9 @@ int nfs_getattr(const struct path *path, struct kstat *stat, goto out_no_update; /* Flush out writes to the server in order to update c/mtime. */ - if ((request_mask & (STATX_CTIME|STATX_MTIME)) && - S_ISREG(inode->i_mode)) { - err = filemap_write_and_wait(inode->i_mapping); - if (err) - goto out; - } + if ((request_mask & (STATX_CTIME | STATX_MTIME)) && + S_ISREG(inode->i_mode)) + filemap_write_and_wait(inode->i_mapping); /* * We may force a getattr if the user cares about atime. diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4f3b5add052b8ff550e88a862e815409ad19f379..d22176d87448a2670c4af158152ba793cd5fddf2 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -277,7 +277,8 @@ struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); int nfs4_replace_transport(struct nfs_server *server, const struct nfs4_fs_locations *locations); - +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, + size_t salen, struct net *net); /* nfs4proc.c */ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); extern int nfs4_async_handle_error(struct rpc_task *task, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 36282e48cff64ef4774afa777bb27c3f99e7ae3c..60999b261666a0dfa086b1113bf066c123608e37 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1276,8 +1276,11 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, } nfs_put_client(clp); - if (server->nfs_client->cl_hostname == NULL) + if (server->nfs_client->cl_hostname == NULL) { server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); + if (server->nfs_client->cl_hostname == NULL) + return -ENOMEM; + } nfs_server_insert_lists(server); return nfs_probe_destination(server); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 24f06dcc2b08eeabd9f6b14591584cf8676749c8..936c412be28efe37823d10e2d6fc8fc763460627 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -121,8 +121,8 @@ static int nfs4_validate_fspath(struct dentry *dentry, return 0; } -static size_t nfs_parse_server_name(char *string, size_t len, - struct sockaddr *sa, size_t salen, struct net *net) +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, + size_t salen, struct net *net) { ssize_t ret; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 5ff5161046993b02fd4a649ef2bb610c72d503a5..f53f8f17629b6075ab94b31bf436be7406586d3b 
100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2064,6 +2064,9 @@ static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) } result = -NFS4ERR_NXIO; + if (!locations->nlocations) + goto out; + if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { dprintk("<-- %s: No fs_locations data, migration skipped\n", __func__); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index feebe80f20929d667346d3248af52bed5157ffc9..e8917fc393a4429ff11ddafa27b1b305b218dee4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3753,8 +3753,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st if (unlikely(!p)) goto out_overflow; n = be32_to_cpup(p); - if (n <= 0) - goto out_eio; for (res->nlocations = 0; res->nlocations < n; res->nlocations++) { u32 m; struct nfs4_fs_location *loc; @@ -4300,10 +4298,11 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, } else printk(KERN_WARNING "%s: label too long (%u)!\n", __func__, len); + if (label && label->label) + dprintk("%s: label=%.*s, len=%d, PI=%d, LFS=%d\n", + __func__, label->len, (char *)label->label, + label->len, label->pi, label->lfs); } - if (label && label->label) - dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__, - (char *)label->label, label->len, label->pi, label->lfs); return status; out_overflow: diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 4028e04a7b0b411a43f7a1fac906778088adddc1..bf6c97ce6d8b42fd28415fb8c793d2dd8450dff9 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -398,6 +398,7 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected, */ take_dentry_name_snapshot(&name, real); this = lookup_one_len(name.name, connected, strlen(name.name)); + release_dentry_name_snapshot(&name); err = PTR_ERR(this); if (IS_ERR(this)) { goto fail; @@ -412,7 +413,6 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected, } out: - release_dentry_name_snapshot(&name); dput(parent); inode_unlock(dir); return this; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index cd1c94f77dee597170f16d39291eadadff02c5f1..63b7b2600d8198e5ea14c1a32d589fd0f40c6356 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -159,7 +159,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file_inode(file); struct fd real; const struct cred *old_cred; - ssize_t ret; + loff_t ret; /* * The two special cases below do not need to involve real fs, @@ -252,13 +252,19 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) return ret; + ret = -EINVAL; + if (iocb->ki_flags & IOCB_DIRECT && + (!real.file->f_mapping->a_ops || + !real.file->f_mapping->a_ops->direct_IO)) + goto out_fdput; + old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, ovl_iocb_to_rwf(iocb)); revert_creds(old_cred); ovl_file_accessed(file); - +out_fdput: fdput(real); return ret; @@ -286,6 +292,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) goto out_unlock; + ret = -EINVAL; + if (iocb->ki_flags & IOCB_DIRECT && + (!real.file->f_mapping->a_ops || + !real.file->f_mapping->a_ops->direct_IO)) + goto out_fdput; + old_cred = ovl_override_creds(file_inode(file)->i_sb); file_start_write(real.file); ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, @@ -295,7 +307,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) /* Update size */ 
ovl_copyattr(ovl_inode_real(inode), inode); - +out_fdput: fdput(real); out_unlock: diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 6729999f4a66ce770252477fd3fc17de2b704168..45fe0c71acf62dccc21709b9d11a61d8ce8210f9 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -377,11 +377,20 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) static int ovl_remount(struct super_block *sb, int *flags, char *data) { struct ovl_fs *ofs = sb->s_fs_info; + struct super_block *upper_sb; + int ret = 0; if (!(*flags & SB_RDONLY) && ovl_force_readonly(ofs)) return -EROFS; - return 0; + if (*flags & SB_RDONLY && !sb_rdonly(sb)) { + upper_sb = ofs->upper_mnt->mnt_sb; + down_read(&upper_sb->s_umount); + ret = sync_filesystem(upper_sb); + up_read(&upper_sb->s_umount); + } + + return ret; } static const struct super_operations ovl_super_operations = { diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 1d1d393f4208ded44916a1e7d574564de616c1f3..ddb379abd919de21f9d55ad1f7e9fea1fa468273 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -687,9 +687,14 @@ int dquot_quota_sync(struct super_block *sb, int type) /* This is not very clever (and fast) but currently I don't know about * any other simple way of getting quota data to disk and we must get * them there for userspace to be visible... */ - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); - sync_blockdev(sb->s_bdev); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 1); + if (ret) + return ret; + } + ret = sync_blockdev(sb->s_bdev); + if (ret) + return ret; /* * Now when everything is written we can discard the pagecache so diff --git a/fs/resctrlfs.c b/fs/resctrlfs.c index ea9df7d77b95a545d323b9c3aaff42d8000d44af..0e6753012140396bd6ed911f257b5ce66d7a31c6 100644 --- a/fs/resctrlfs.c +++ b/fs/resctrlfs.c @@ -989,7 +989,7 @@ static void resctrl_group_default_init(struct resctrl_group *r) r->type = RDTCTRL_GROUP; } -static int __init resctrl_group_setup_root(void) +static int resctrl_group_setup_root(void) { int ret; diff --git a/fs/select.c b/fs/select.c index b684f0dd6db864ca76450386d6612be0f126de9d..67e4be14c00c4217e58277926a954653d8165043 100644 --- a/fs/select.c +++ b/fs/select.c @@ -458,9 +458,11 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds) return max; } -#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR) -#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR) -#define POLLEX_SET (EPOLLPRI) +#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\ + EPOLLNVAL) +#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\ + EPOLLNVAL) +#define POLLEX_SET (EPOLLPRI | EPOLLNVAL) static inline void wait_key_set(poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, @@ -527,6 +529,7 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) break; if (!(bit & all_bits)) continue; + mask = EPOLLNVAL; f = fdget(i); if (f.file) { wait_key_set(wait, in, out, bit, @@ -534,34 +537,34 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) mask = vfs_poll(f.file, wait); fdput(f); - if ((mask & POLLIN_SET) && (in & bit)) { - res_in |= bit; - retval++; - wait->_qproc = NULL; - } - if ((mask & POLLOUT_SET) && (out & bit)) { - res_out |= bit; - retval++; - wait->_qproc = NULL; - } - if ((mask & POLLEX_SET) && (ex & bit)) { - res_ex |= bit; - retval++; - wait->_qproc = NULL; - } - /* got something, stop busy polling */ - if (retval) { - can_busy_loop = false; 
- busy_flag = 0; - - /* - * only remember a returned - * POLL_BUSY_LOOP if we asked for it - */ - } else if (busy_flag & mask) - can_busy_loop = true; - } + if ((mask & POLLIN_SET) && (in & bit)) { + res_in |= bit; + retval++; + wait->_qproc = NULL; + } + if ((mask & POLLOUT_SET) && (out & bit)) { + res_out |= bit; + retval++; + wait->_qproc = NULL; + } + if ((mask & POLLEX_SET) && (ex & bit)) { + res_ex |= bit; + retval++; + wait->_qproc = NULL; + } + /* got something, stop busy polling */ + if (retval) { + can_busy_loop = false; + busy_flag = 0; + + /* + * only remember a returned + * POLL_BUSY_LOOP if we asked for it + */ + } else if (busy_flag & mask) + can_busy_loop = true; + } if (res_in) *rinp = res_in; diff --git a/fs/super.c b/fs/super.c index 9fb4553c46e631d9879c31eed49bb02f70955d16..8dc26e23f1a3529b84cfe19c96d192992ffa1f1c 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1404,11 +1404,9 @@ static void lockdep_sb_freeze_acquire(struct super_block *sb) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } -static void sb_freeze_unlock(struct super_block *sb) +static void sb_freeze_unlock(struct super_block *sb, int level) { - int level; - - for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } @@ -1479,7 +1477,14 @@ int freeze_super(struct super_block *sb) sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ - sync_filesystem(sb); + ret = sync_filesystem(sb); + if (ret) { + sb->s_writers.frozen = SB_UNFROZEN; + sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); + wake_up(&sb->s_writers.wait_unfrozen); + deactivate_locked_super(sb); + return ret; + } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; @@ -1491,7 +1496,7 @@ int freeze_super(struct super_block *sb) printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; - sb_freeze_unlock(sb); + sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; @@ -1542,7 +1547,7 @@ static int thaw_super_locked(struct super_block *sb) } sb->s_writers.frozen = SB_UNFROZEN; - sb_freeze_unlock(sb); + sb_freeze_unlock(sb, SB_FREEZE_FS); out: wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 376bee94b5dd0b92ecc94cab2a92247a08adc88e..2f0c9610dabcec8a76caf25f042ac9ef816b567c 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -1882,7 +1882,8 @@ xfs_da3_path_shift( ASSERT(path != NULL); ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); level = (path->active-1) - 1; /* skip bottom layer in path */ - for (blk = &path->blk[level]; level >= 0; blk--, level--) { + for (; level >= 0; level--) { + blk = &path->blk[level]; node = blk->bp->b_addr; dp->d_ops->node_hdr_from_disk(&nodehdr, node); btree = dp->d_ops->node_tree_p(node); diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index f1260b4bfdeed62440cc238138e7fd405c2dccf1..17b6dde71bfe3825c3d97786770ef5485bc7dbfe 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -286,7 +286,11 @@ xchk_da_btree_block_check_sibling( /* Compare upper level pointer to sibling pointer. 
*/ if (ds->state->altpath.blk[level].blkno != sibling) xchk_da_set_corrupt(ds, level); - xfs_trans_brelse(ds->dargs.trans, ds->state->altpath.blk[level].bp); + if (ds->state->altpath.blk[level].bp) { + xfs_trans_brelse(ds->dargs.trans, + ds->state->altpath.blk[level].bp); + ds->state->altpath.blk[level].bp = NULL; + } out: return error; } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 4309cd07331a26779218302449071858427cd3e3..9b37a362de35102669479675b5a74e4d0d1772c4 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -605,6 +605,8 @@ xfs_buf_item_unlock( #if defined(DEBUG) || defined(XFS_WARN) bool ordered = bip->bli_flags & XFS_BLI_ORDERED; bool dirty = bip->bli_flags & XFS_BLI_DIRTY; + bool aborted = test_bit(XFS_LI_ABORTED, + &lip->li_flags); #endif trace_xfs_buf_item_unlock(bip); @@ -633,7 +635,7 @@ xfs_buf_item_unlock( released = xfs_buf_item_put(bip); if (hold || (stale && !released)) return; - ASSERT(!stale || test_bit(XFS_LI_ABORTED, &lip->li_flags)); + ASSERT(!stale || aborted); xfs_buf_relse(bp); } diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index a378a1d53fb99a700072027a5c04ebdda837293a..8d3eb27e68906b75cfbf920b1ddad41c3d59bee8 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -86,6 +86,13 @@ ARM_SMCCC_SMC_32, \ 0, 0x7fff) +#define ARM_SMCCC_ARCH_WORKAROUND_3 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + 0, 0x3fff) + +#define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED 1 + #ifndef __ASSEMBLY__ #include diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 28ea02865ecc1fdec6203143ff08cfbdb520bb4c..1deaf36eb2371d7a39f737d0aca0e98628b97b91 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -28,6 +28,7 @@ #include #include +extern bool precise_iostat; struct module; struct scsi_ioctl_command; @@ -1011,6 +1012,8 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, extern void blk_rq_unprep_clone(struct request *rq); extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); +extern blk_status_t __blk_insert_cloned_request(struct request_queue *q, + struct request *rq, bool precise); extern int blk_rq_append_bio(struct request *rq, struct bio **bio); extern void blk_delay_queue(struct request_queue *, unsigned long); extern void blk_queue_split(struct request_queue *, struct bio **); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c40fa25cf2ca359e2f3cf5527b6198b807c7c56c..30fcf812df9d37daa6d79e41d769a2f9faeb1c41 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -546,6 +546,11 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); int array_map_alloc_check(union bpf_attr *attr); +static inline bool unprivileged_ebpf_enabled(void) +{ + return !sysctl_unprivileged_bpf_disabled; +} + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -657,6 +662,12 @@ static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, { return ERR_PTR(-EOPNOTSUPP); } + +static inline bool unprivileged_ebpf_enabled(void) +{ + return false; +} + #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h index e44746de95cdf9193c470d78382797465f39c5ba..c697a0524273522a9f6558f13b89a1bdd4d4b46d 100644 --- a/include/linux/if_arp.h +++ b/include/linux/if_arp.h @@ -55,6 +55,7 @@ 
static inline bool dev_is_mac_header_xmit(const struct net_device *dev) case ARPHRD_VOID: case ARPHRD_NONE: case ARPHRD_RAWIP: + case ARPHRD_PIMREG: return false; default: return true; diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 97a020c616ad1f693507e256cece8ef1ab81d87d..d17ea535836852e4e1227e66b0997ecfc37fb287 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1211,22 +1211,22 @@ * * @binder_set_context_mgr: * Check whether @mgr is allowed to be the binder context manager. - * @mgr contains the task_struct for the task being registered. + * @mgr contains the struct cred for the current binder process. * Return 0 if permission is granted. * @binder_transaction: * Check whether @from is allowed to invoke a binder transaction call * to @to. - * @from contains the task_struct for the sending task. - * @to contains the task_struct for the receiving task. + * @from contains the struct cred for the sending process. + * @to contains the struct cred for the receiving process. * @binder_transfer_binder: * Check whether @from is allowed to transfer a binder reference to @to. - * @from contains the task_struct for the sending task. - * @to contains the task_struct for the receiving task. + * @from contains the struct cred for the sending process. + * @to contains the struct cred for the receiving process. * @binder_transfer_file: * Check whether @from is allowed to transfer @file to @to. - * @from contains the task_struct for the sending task. + * @from contains the struct cred for the sending process. * @file contains the struct file being transferred. - * @to contains the task_struct for the receiving task. + * @to contains the struct cred for the receiving process. * * @ptrace_access_check: * Check permission before allowing the current process to trace the @@ -1428,13 +1428,13 @@ * */ union security_list_options { - int (*binder_set_context_mgr)(struct task_struct *mgr); - int (*binder_transaction)(struct task_struct *from, - struct task_struct *to); - int (*binder_transfer_binder)(struct task_struct *from, - struct task_struct *to); - int (*binder_transfer_file)(struct task_struct *from, - struct task_struct *to, + int (*binder_set_context_mgr)(const struct cred *mgr); + int (*binder_transaction)(const struct cred *from, + const struct cred *to); + int (*binder_transfer_binder)(const struct cred *from, + const struct cred *to); + int (*binder_transfer_file)(const struct cred *from, + const struct cred *to, struct file *file); int (*ptrace_access_check)(struct task_struct *child, diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index ba5d41edbc44d9a5b7573ac065062fef91a247cb..6d57c36fb676281563fcf457e72d41bb6b5ac42d 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -21,9 +21,9 @@ extern bool shmem_reliable; extern struct percpu_counter reliable_shmem_used_nr_page; extern bool pagecache_use_reliable_mem; DECLARE_PER_CPU(long, nr_reliable_buddy_pages); -DECLARE_PER_CPU(long, pagecache_reliable_pages); -DECLARE_PER_CPU(long, anon_reliable_pages); -extern unsigned long nr_reliable_reserve_pages __read_mostly; + +extern struct percpu_counter pagecache_reliable_pages; +extern struct percpu_counter anon_reliable_pages; extern long shmem_reliable_nr_page __read_mostly; extern void add_reliable_mem_size(long sz); @@ -43,6 +43,7 @@ extern void reliable_lru_add(enum lru_list lru, struct page *page, extern void page_cache_prepare_alloc(gfp_t *gfp); extern void reliable_lru_add_batch(int zid, enum lru_list 
lru, int val); +extern bool mem_reliable_counter_initialized(void); static inline bool mem_reliable_is_enabled(void) { @@ -81,13 +82,10 @@ static inline void reliable_page_counter(struct page *page, static inline bool reliable_mem_limit_check(unsigned long nr_page) { - int cpu; - long num = 0; + s64 num; - for_each_possible_cpu(cpu) { - num += per_cpu(pagecache_reliable_pages, cpu); - num += per_cpu(anon_reliable_pages, cpu); - } + num = percpu_counter_read_positive(&pagecache_reliable_pages); + num += percpu_counter_read_positive(&anon_reliable_pages); return num + nr_page <= task_reliable_limit / PAGE_SIZE; } @@ -119,18 +117,6 @@ static inline void mem_reliable_buddy_counter(struct page *page, int nr_page) this_cpu_add(nr_reliable_buddy_pages, nr_page); } -/* reserve mirrored memory for kernel usage */ -static inline bool mem_reliable_watermark_ok(int nr_page) -{ - long sum = 0; - int cpu; - - for_each_possible_cpu(cpu) - sum += per_cpu(nr_reliable_buddy_pages, cpu); - - return sum > nr_reliable_reserve_pages; -} - static inline bool mem_reliable_shmem_limit_check(void) { return percpu_counter_read_positive(&reliable_shmem_used_nr_page) < @@ -176,7 +162,6 @@ static inline void shmem_reliable_page_counter(struct page *page, int nr_page) static inline bool pagecache_reliable_is_enabled(void) { return false; } static inline bool mem_reliable_status(void) { return false; } static inline void mem_reliable_buddy_counter(struct page *page, int nr_page) {} -static inline bool mem_reliable_watermark_ok(int nr_page) { return true; } static inline bool mem_reliable_shmem_limit_check(void) { return true; } static inline void reliable_lru_add(enum lru_list lru, struct page *page, @@ -184,6 +169,8 @@ static inline void reliable_lru_add(enum lru_list lru, static inline void page_cache_prepare_alloc(gfp_t *gfp) {} static inline void reliable_lru_add_batch(int zid, enum lru_list lru, int val) {} + +static inline bool mem_reliable_counter_initialized(void) { return false; } #endif #endif diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 00437ae80b6f276b6956fd6aaeaf1559f1a6a2d4..17ec6bb919d7808e31a6a0253cf1cc49e0b5a43f 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -219,6 +219,9 @@ static inline bool vma_migratable(struct vm_area_struct *vma) extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); extern void mpol_put_task_policy(struct task_struct *); +extern long __do_mbind(unsigned long start, unsigned long len, + unsigned short mode, unsigned short mode_flags, + nodemask_t *nmask, unsigned long flags, struct mm_struct *mm); #else struct mempolicy {}; @@ -322,5 +325,12 @@ static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, static inline void mpol_put_task_policy(struct task_struct *task) { } + +static long __do_mbind(unsigned long start, unsigned long len, + unsigned short mode, unsigned short mode_flags, + nodemask_t *nmask, unsigned long flags, struct mm_struct *mm) +{ + return 0; +} #endif /* CONFIG_NUMA */ #endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9a7d7e6304f4425270d15ca5252868cc362b04e4..752df39590bacd2a34cf8807d123c73e0a381cce 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3790,7 +3790,8 @@ void netdev_run_todo(void); */ static inline void dev_put(struct net_device *dev) { - this_cpu_dec(*dev->pcpu_refcnt); + if (dev) + this_cpu_dec(*dev->pcpu_refcnt); } /** @@ -3801,7 +3802,8 @@ static inline void dev_put(struct net_device *dev) 
*/ static inline void dev_hold(struct net_device *dev) { - this_cpu_inc(*dev->pcpu_refcnt); + if (dev) + this_cpu_inc(*dev->pcpu_refcnt); } /* Carrier loss detection, dial on demand. The functions netif_carrier_on diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 5225832bd6ff11415422470c5c25e60c1dad6821..bb9cb84114c151059ccac74ca07a113ea22c347a 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -6,6 +6,7 @@ #include #include #include +#include #include extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); @@ -34,6 +35,7 @@ extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); extern int rtnl_lock_killable(void); +extern bool refcount_dec_and_rtnl_lock(refcount_t *r); extern wait_queue_head_t netdev_unregistering_wq; extern struct rw_semaphore pernet_ops_rwsem; @@ -83,6 +85,11 @@ static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) return rtnl_dereference(dev->ingress_queue); } +static inline struct netdev_queue *dev_ingress_queue_rcu(struct net_device *dev) +{ + return rcu_dereference(dev->ingress_queue); +} + struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); #ifdef CONFIG_NET_INGRESS diff --git a/include/linux/sched.h b/include/linux/sched.h index 0202fe00731820de52cac6145c58b70d5d3da4e2..ec7418ff12d4265cfcdef4e55b25c658862f76a1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1432,7 +1432,6 @@ extern struct pid *cad_pid; #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ -#define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index c8b52b3ec86543bc452d85447e6f9544703d90d4..1c2c099e393b0f2509e847e6aa7a66862b97be5f 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -33,7 +33,7 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_post_fork(struct task_struct *p); +extern void sched_cgroup_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); diff --git a/include/linux/security.h b/include/linux/security.h index 75f4156c84d7d251cfc5a041a7a149b1d7588680..9eb1c7a0f280b264ba1dd9a4371073df652e00d4 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -216,13 +216,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) extern int security_init(void); /* Security operations */ -int security_binder_set_context_mgr(struct task_struct *mgr); -int security_binder_transaction(struct task_struct *from, - struct task_struct *to); -int security_binder_transfer_binder(struct task_struct *from, - struct task_struct *to); -int security_binder_transfer_file(struct task_struct *from, - struct task_struct *to, struct file *file); +int security_binder_set_context_mgr(const struct cred *mgr); +int security_binder_transaction(const struct cred *from, + const struct cred *to); +int 
security_binder_transfer_binder(const struct cred *from, + const struct cred *to); +int security_binder_transfer_file(const struct cred *from, + const struct cred *to, struct file *file); int security_ptrace_access_check(struct task_struct *child, unsigned int mode); int security_ptrace_traceme(struct task_struct *parent); int security_capget(struct task_struct *target, @@ -439,25 +439,25 @@ static inline int security_init(void) return 0; } -static inline int security_binder_set_context_mgr(struct task_struct *mgr) +static inline int security_binder_set_context_mgr(const struct cred *mgr) { return 0; } -static inline int security_binder_transaction(struct task_struct *from, - struct task_struct *to) +static inline int security_binder_transaction(const struct cred *from, + const struct cred *to) { return 0; } -static inline int security_binder_transfer_binder(struct task_struct *from, - struct task_struct *to) +static inline int security_binder_transfer_binder(const struct cred *from, + const struct cred *to) { return 0; } -static inline int security_binder_transfer_file(struct task_struct *from, - struct task_struct *to, +static inline int security_binder_transfer_file(const struct cred *from, + const struct cred *to, struct file *file) { return 0; diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 669d32f9a092610b41445d170a0996b9efce66ec..ba001069fbd25ff12902aefc4fdf7720988029f3 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -13,6 +13,7 @@ #define SP_HUGEPAGE_ONLY (1 << 1) #define SP_DVPP (1 << 2) #define SP_SPEC_NODE_ID (1 << 3) +#define SP_PROT_RO (1 << 16) #define DEVICE_ID_BITS 4UL #define DEVICE_ID_MASK ((1UL << DEVICE_ID_BITS) - 1UL) @@ -22,7 +23,7 @@ #define NODE_ID_SHIFT (DEVICE_ID_SHIFT + DEVICE_ID_BITS) #define SP_FLAG_MASK (SP_HUGEPAGE | SP_HUGEPAGE_ONLY | SP_DVPP | \ - SP_SPEC_NODE_ID | \ + SP_SPEC_NODE_ID | SP_PROT_RO | \ (DEVICE_ID_MASK << DEVICE_ID_SHIFT) | \ (NODE_ID_MASK << NODE_ID_SHIFT)) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 10bcbea6e952e9abe209840d5b142d36461fce69..8aa865bce4f60cd7735d2f2f0e83ece23c071b36 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -144,6 +144,7 @@ struct rpc_add_xprt_test { #define RPC_CLNT_CREATE_NO_IDLE_TIMEOUT (1UL << 8) #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT (1UL << 9) #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) +#define RPC_CLNT_CREATE_CONNECTED (1UL << 12) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 3e2ee7739c11a1f448962eefac876b55282e0bf0..a7c197299fc72a5645e16b2632e3917d478995dc 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -10,6 +10,9 @@ static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, const struct virtio_net_hdr *hdr) { + if (skb->protocol) + return 0; + switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: case VIRTIO_NET_HDR_GSO_UDP: diff --git a/include/net/ax25.h b/include/net/ax25.h index 8b7eb46ad72d8804c1ffaa3943bb2816113239d8..aadff553e4b734aa6af206ca65f64fff5119eba9 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -236,6 +236,7 @@ typedef struct ax25_dev { #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_dama_info dama; #endif + refcount_t refcount; } ax25_dev; typedef struct ax25_cb { @@ -290,6 +291,17 @@ static __inline__ void ax25_cb_put(ax25_cb 
*ax25) } } +static inline void ax25_dev_hold(ax25_dev *ax25_dev) +{ + refcount_inc(&ax25_dev->refcount); +} + +static inline void ax25_dev_put(ax25_dev *ax25_dev) +{ + if (refcount_dec_and_test(&ax25_dev->refcount)) { + kfree(ax25_dev); + } +} static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev) { skb->dev = dev; diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index fc3111515f5cbec0a230cfd242c5a8ba9dc24571..732bc3b4606bf7df8c7d96fda16fc173d48c1d50 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -265,7 +265,7 @@ struct ad_system { struct ad_bond_info { struct ad_system system; /* 802.3ad system structure */ - u32 agg_select_timer; /* Timer to select aggregator after all adapter's hand shakes */ + atomic_t agg_select_timer; /* Timer to select aggregator after all adapter's hand shakes */ u16 aggregator_identifier; }; diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 14efa0ded75dd93d6f63735a95feaf72de6fdef7..adab27ba1ecbf054f5284e4342b1551b104d9a2c 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -123,8 +123,20 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, sizeof(struct ip_tunnel_info) + md_size); +#ifdef CONFIG_DST_CACHE + /* Unclone the dst cache if there is one */ + if (new_md->u.tun_info.dst_cache.cache) { + int ret; + + ret = dst_cache_init(&new_md->u.tun_info.dst_cache, GFP_ATOMIC); + if (ret) { + metadata_dst_free(new_md); + return ERR_PTR(ret); + } + } +#endif + skb_dst_drop(skb); - dst_hold(&new_md->dst); skb_dst_set(skb, &new_md->dst); return new_md; } diff --git a/include/net/esp.h b/include/net/esp.h index 117652eb6ea32a836cd269f01e70f0e3c990fbc7..465e38890ee986023fd7502d0abeb80bca3182a5 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,6 +4,8 @@ #include +#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) + struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/include/net/ip.h b/include/net/ip.h index 17766586dfa09869e3b0b0ccefefe3effc1d08dc..621eeb9f142e66e6160481bcdb4674de22fbabaa 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -437,19 +437,18 @@ static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb, { struct iphdr *iph = ip_hdr(skb); + /* We had many attacks based on IPID, use the private + * generator as much as we can. + */ + if (sk && inet_sk(sk)->inet_daddr) { + iph->id = htons(inet_sk(sk)->inet_id); + inet_sk(sk)->inet_id += segs; + return; + } if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) { - /* This is only to work around buggy Windows95/2000 - * VJ compression implementations. If the ID field - * does not change, they drop every other packet in - * a TCP stream using header compression. 
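Read as a whole, the ip_select_ident_segs() hunk around this point reorders the checks so that a connected socket's private IPID counter is always preferred. A minimal flattened sketch of the post-patch flow (illustrative only; it simply mirrors the added and removed lines of this hunk, it is not the literal source):

/* Illustrative flattening of the post-patch ip_select_ident_segs() flow. */
static inline void ip_select_ident_segs_sketch(struct net *net, struct sk_buff *skb,
					       struct sock *sk, int segs)
{
	struct iphdr *iph = ip_hdr(skb);

	if (sk && inet_sk(sk)->inet_daddr) {
		/* Connected socket: use its private IPID generator. */
		iph->id = htons(inet_sk(sk)->inet_id);
		inet_sk(sk)->inet_id += segs;
		return;
	}

	if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df)
		iph->id = 0;				/* DF set: IPID is not needed */
	else
		__ip_select_ident(net, iph, segs);	/* fall back to the hashed generator */
}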
- */ - if (sk && inet_sk(sk)->inet_daddr) { - iph->id = htons(inet_sk(sk)->inet_id); - inet_sk(sk)->inet_id += segs; - } else { - iph->id = 0; - } + iph->id = 0; } else { + /* Unfortunately we need the big hammer to get a suitable IPID */ __ip_select_ident(net, iph, segs); } } diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 62c936230cc8bb290d0f201ed7a5bca9d74800ea..b4fea9dd589d4e1524d75a59bbc378bfa968fb62 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -243,7 +243,7 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, fn = rcu_dereference(f6i->fib6_node); if (fn) { - *cookie = fn->fn_sernum; + *cookie = READ_ONCE(fn->fn_sernum); /* pairs with smp_wmb() in fib6_update_sernum_upto_root() */ smp_rmb(); status = true; diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index a50a69f5334c8647d129ac1edce909edfc442691..861414a62ce152b128706ee0b59b636c20c56fc4 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -32,7 +32,7 @@ void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *q void nf_unregister_queue_handler(struct net *net); void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); -void nf_queue_entry_get_refs(struct nf_queue_entry *entry); +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry); void nf_queue_entry_release_refs(struct nf_queue_entry *entry); static inline void init_hashrandom(u32 *jhash_initval) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index edca90ef3bdc4e4aacf83c691e09ef214e58a1b5..1a6ac924266dbd97f1e1be6acdff5500cbb01c6a 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -103,6 +103,7 @@ int qdisc_set_default(const char *id); void qdisc_hash_add(struct Qdisc *q, bool invisible); void qdisc_hash_del(struct Qdisc *q); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); +struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle); struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab, struct netlink_ext_ack *extack); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c9cd5086bd544e1be75b594fa34d8c5113f6526e..f5f45390828d0674426b8d09f7061a2b7c84df5a 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -93,7 +93,13 @@ struct Qdisc { struct gnet_stats_queue __percpu *cpu_qstats; int padded; refcount_t refcnt; - +#ifndef __GENKSYMS__ + /* To adapt to KABI, put rcu before gso_skb for 64-bit kernel. + * RCU will use 16 Bytes, and the space is enough. It's unuse + * for 32-bit kernel. + */ + struct rcu_head rcu; +#endif /* * For performance sake on SMP, we put highly modified fields at the end */ @@ -117,6 +123,19 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc) refcount_inc(&qdisc->refcnt); } +/* Intended to be used by unlocked users, when concurrent qdisc release is + * possible. 
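The helper declared just below is meant for exactly such unlocked readers. A hedged sketch of the intended calling pattern (the lookup path and caller here are hypothetical, not part of the patch):

/* Hypothetical RCU reader: only keep the qdisc if its refcount is still live. */
static struct Qdisc *example_grab_qdisc(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	rcu_read_lock();
	q = qdisc_lookup_rcu(dev, handle);	/* lookup helper added by this series */
	if (q)
		q = qdisc_refcount_inc_nz(q);	/* NULL if a concurrent release already won */
	rcu_read_unlock();

	return q;	/* caller must later drop the reference it took, if any */
}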
+ */ + +static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc) +{ + if (qdisc->flags & TCQ_F_BUILTIN) + return qdisc; + if (refcount_inc_not_zero(&qdisc->refcnt)) + return qdisc; + return NULL; +} + static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) diff --git a/include/net/sock.h b/include/net/sock.h index 803464e66e02cd7d6f325726ebbb3f39aee3742b..b001ddbed3f39f3bb9be5332981cc32267cf8966 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2584,6 +2584,8 @@ extern int sysctl_optmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; +#define SKB_FRAG_PAGE_ORDER get_order(32768) + static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_wmem ? */ diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h index defdf92911c3ac92fa3875e9efe4f124a6bb1580..028b4e84618545386a0b4cecd39b8f7c8e95cd61 100644 --- a/include/uapi/asm-generic/mman.h +++ b/include/uapi/asm-generic/mman.h @@ -21,6 +21,8 @@ #define MAP_REPLACE 0x1000000 #endif +#define MAP_ALIGN 0x2000000 /* create an aligned mapping */ + /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */ #define MCL_CURRENT 1 /* lock all current mappings */ diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 5f3b9fec7b5f4491ad9f38beea7447a305ff4fb0..57d7d5a99c9dab8a292540581b6a029041cc0bf2 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -504,6 +504,12 @@ struct xfrm_user_offload { int ifindex; __u8 flags; }; +/* This flag was exposed without any kernel code that supporting it. + * Unfortunately, strongswan has the code that uses sets this flag, + * which makes impossible to reuse this bit. + * + * So leave it here to make sure that it won't be reused by mistake. + */ #define XFRM_OFFLOAD_IPV6 1 #define XFRM_OFFLOAD_INBOUND 2 diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index a9978350b45b0bf24290771568f03f1c7f93da3f..a58a89cc0e97d5e6b1f36dc349971ee4e4775902 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -97,17 +97,32 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly); * access has been ended, free the given page too. Access will be ended * immediately iff the grant entry is not in use, otherwise it will happen * some time later. page may be 0, in which case no freeing will occur. + * Note that the granted page might still be accessed (read or write) by the + * other side after gnttab_end_foreign_access() returns, so even if page was + * specified as 0 it is not allowed to just reuse the page for other + * purposes immediately. gnttab_end_foreign_access() will take an additional + * reference to the granted page in this case, which is dropped only after + * the grant is no longer in use. + * This requires that multi page allocations for areas subject to + * gnttab_end_foreign_access() are done via alloc_pages_exact() (and freeing + * via free_pages_exact()) in order to avoid high order pages. */ void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page); +/* + * End access through the given grant reference, iff the grant entry is + * no longer in use. In case of success ending foreign access, the + * grant reference is deallocated. + * Return 1 if the grant entry was freed, 0 if it is still in use. 
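A hedged caller-side sketch of the helper declared just below, paired with the long-standing gnttab_end_foreign_access(); the driver state names are made up and only illustrate the split between "grant already unused" and "still in use, defer the free":

/* Illustrative teardown path; 'ref' and 'page' are hypothetical driver state. */
static void example_release_granted_page(grant_ref_t ref, unsigned long page)
{
	if (gnttab_try_end_foreign_access(ref)) {
		/* Grant no longer in use and already freed: the page is ours again. */
		free_page(page);
	} else {
		/*
		 * Still accessible by the other end; gnttab_end_foreign_access()
		 * takes an extra reference and frees the page only once the
		 * grant really goes away, as described in the comment above.
		 */
		gnttab_end_foreign_access(ref, 0 /* readonly */, page);
	}
}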
+ */ +int gnttab_try_end_foreign_access(grant_ref_t ref); + int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); -int gnttab_query_foreign_access(grant_ref_t ref); - /* * operations on reserved batches of grant references */ diff --git a/init/Kconfig b/init/Kconfig index 1a0b15c5a82b977ba5af765e4657c1f4521ce869..cef0e5fdc9014f26504a22a0cf33f4b3e1f7c71b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1539,6 +1539,16 @@ config BPF_JIT_ALWAYS_ON Enables BPF JIT and removes BPF interpreter to avoid speculative execution of BPF instructions by the interpreter +config BPF_UNPRIV_DEFAULT_OFF + bool "Disable unprivileged BPF by default" + depends on BPF_SYSCALL + help + Disables unprivileged BPF by default by setting the corresponding + /proc/sys/kernel/unprivileged_bpf_disabled knob to 2. An admin can + still reenable it by setting it to 0 later on, or permanently + disable it by setting it to 1 (from which no other transition to + 0 is possible anymore). + config USERFAULTFD bool "Enable userfaultfd() system call" select ANON_INODES diff --git a/kernel/async.c b/kernel/async.c index a893d6170944f7264c63e1e890420c9ecd0e9855..4bf1b00a28d864628a9e284b3cb5f46aa85b4e4e 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -191,9 +191,6 @@ static async_cookie_t __async_schedule(async_func_t func, void *data, struct asy atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* mark that this task has queued an async job, used by module init */ - current->flags |= PF_USED_ASYNC; - /* schedule for execution */ queue_work(system_unbound_wq, &entry->work); diff --git a/kernel/audit.c b/kernel/audit.c index 3de5ebb94559216d43908ac0cc5a0c3c7cfb9a01..7dc14a4d9e3cf4b32de6e7b02352fdf0520af3c9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -549,20 +549,22 @@ static void kauditd_printk_skb(struct sk_buff *skb) /** * kauditd_rehold_skb - Handle a audit record send failure in the hold queue * @skb: audit record + * @error: error code (unused) * * Description: * This should only be used by the kauditd_thread when it fails to flush the * hold queue. */ -static void kauditd_rehold_skb(struct sk_buff *skb) +static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error) { - /* put the record back in the queue at the same place */ - skb_queue_head(&audit_hold_queue, skb); + /* put the record back in the queue */ + skb_queue_tail(&audit_hold_queue, skb); } /** * kauditd_hold_skb - Queue an audit record, waiting for auditd * @skb: audit record + * @error: error code * * Description: * Queue the audit record, waiting for an instance of auditd. When this @@ -572,19 +574,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb) * and queue it, if we have room. If we want to hold on to the record, but we * don't have room, record a record lost message. */ -static void kauditd_hold_skb(struct sk_buff *skb) +static void kauditd_hold_skb(struct sk_buff *skb, int error) { /* at this point it is uncertain if we will ever send this to auditd so * try to send the message via printk before we go any further */ kauditd_printk_skb(skb); /* can we just silently drop the message? 
*/ - if (!audit_default) { - kfree_skb(skb); - return; + if (!audit_default) + goto drop; + + /* the hold queue is only for when the daemon goes away completely, + * not -EAGAIN failures; if we are in a -EAGAIN state requeue the + * record on the retry queue unless it's full, in which case drop it + */ + if (error == -EAGAIN) { + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + audit_log_lost("kauditd retry queue overflow"); + goto drop; } - /* if we have room, queue the message */ + /* if we have room in the hold queue, queue the message */ if (!audit_backlog_limit || skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { skb_queue_tail(&audit_hold_queue, skb); @@ -593,24 +607,32 @@ static void kauditd_hold_skb(struct sk_buff *skb) /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); +drop: kfree_skb(skb); } /** * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd * @skb: audit record + * @error: error code (unused) * * Description: * Not as serious as kauditd_hold_skb() as we still have a connected auditd, * but for some reason we are having problems sending it audit records so * queue the given record and attempt to resend. */ -static void kauditd_retry_skb(struct sk_buff *skb) +static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) { - /* NOTE: because records should only live in the retry queue for a - * short period of time, before either being sent or moved to the hold - * queue, we don't currently enforce a limit on this queue */ - skb_queue_tail(&audit_retry_queue, skb); + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + + /* we have to drop the record, send it via printk as a last effort */ + kauditd_printk_skb(skb); + audit_log_lost("kauditd retry queue overflow"); + kfree_skb(skb); } /** @@ -648,7 +670,7 @@ static void auditd_reset(const struct auditd_connection *ac) /* flush the retry queue to the hold queue, but don't touch the main * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) - kauditd_hold_skb(skb); + kauditd_hold_skb(skb, -ECONNREFUSED); } /** @@ -722,16 +744,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, struct sk_buff_head *queue, unsigned int retry_limit, void (*skb_hook)(struct sk_buff *skb), - void (*err_hook)(struct sk_buff *skb)) + void (*err_hook)(struct sk_buff *skb, int error)) { int rc = 0; - struct sk_buff *skb; + struct sk_buff *skb = NULL; + struct sk_buff *skb_tail; unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. sk and portid) */ - while ((skb = skb_dequeue(queue))) { + skb_tail = skb_peek_tail(queue); + while ((skb != skb_tail) && (skb = skb_dequeue(queue))) { /* call the skb_hook for each skb we touch */ if (skb_hook) (*skb_hook)(skb); @@ -739,9 +763,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, /* can we send to anyone via unicast? 
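With the send error now threaded through, an err_hook can distinguish transient -EAGAIN failures from a lost daemon; the real hooks above are kauditd_hold_skb()/kauditd_retry_skb()/kauditd_rehold_skb(). A minimal hypothetical hook in the new shape, shown only to make the signature change concrete:

/* Hypothetical err_hook matching the new signature: retry on -EAGAIN, else drop. */
static void example_err_hook(struct sk_buff *skb, int error)
{
	if (error == -EAGAIN &&
	    (!audit_backlog_limit ||
	     skb_queue_len(&audit_retry_queue) < audit_backlog_limit)) {
		skb_queue_tail(&audit_retry_queue, skb);
		return;
	}

	audit_log_lost("example hook dropped record");
	kfree_skb(skb);
}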
*/ if (!sk) { if (err_hook) - (*err_hook)(skb); - if (queue == &audit_hold_queue) - goto out; + (*err_hook)(skb, -ECONNREFUSED); continue; } @@ -755,11 +777,9 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, rc == -ECONNREFUSED || rc == -EPERM) { sk = NULL; if (err_hook) - (*err_hook)(skb); + (*err_hook)(skb, rc); if (rc == -EAGAIN) rc = 0; - if (queue == &audit_hold_queue) - goto out; /* continue to drain the queue */ continue; } else @@ -771,7 +791,6 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, } } -out: return (rc >= 0 ? 0 : rc); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 30253fb4254e42047b6e95bb984c5c9496286c86..b11852ea1822864415859f7f6d7c4372d949d6ce 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -48,7 +48,8 @@ static DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); static DEFINE_SPINLOCK(map_idr_lock); -int sysctl_unprivileged_bpf_disabled __read_mostly; +int sysctl_unprivileged_bpf_disabled __read_mostly = + IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0; static const struct bpf_map_ops * const bpf_map_types[] = { #define BPF_PROG_TYPE(_id, _ops) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index feb91177247c8deca3d59dac325db4712b1b98a2..3544ee391350c13909e409de98b943140fdfcbe5 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1532,6 +1532,11 @@ static void cpuset_attach(struct cgroup_taskset *tset) mutex_lock(&cpuset_mutex); + /* + * It should hold cpus lock because a cpu offline event can + * cause set_cpus_allowed_ptr() failed. + */ + get_online_cpus(); /* prepare for attach */ if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); @@ -1550,6 +1555,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, task); } + put_online_cpus(); /* * Change mm for all threadgroup leaders. This is expensive and may diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 2a8c41f12d45064e07d6dc88bd84ca97e234f492..694e7ccea659823885d640df5e87f2eaa0010966 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -583,10 +583,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, */ for (i = 0; i < nslots; i++) io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); - + /* + * When dir == DMA_FROM_DEVICE we could omit the copy from the orig + * to the tlb buffer, if we knew for sure the device will + * overwirte the entire current content. But we don't. Thus + * unconditional bounce may prevent leaking swiotlb content (i.e. + * kernel memory) to user-space. 
+ */ + swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); return tlb_addr; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 00b22c820d1f3503d932dc65dd187fb03610654a..8dc07a529e6d04a6cb98048afb9f73da871e50ae 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -798,7 +798,7 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); */ static void perf_cgroup_switch(struct task_struct *task, int mode) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_context *cpuctx, *tmp; struct list_head *list; unsigned long flags; @@ -809,7 +809,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) local_irq_save(flags); list = this_cpu_ptr(&cgrp_cpuctx_list); - list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) { + list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) { WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); perf_ctx_lock(cpuctx, cpuctx->task_ctx); @@ -2135,28 +2135,6 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla event_function_call(event, __perf_remove_from_context, (void *)flags); - /* - * This is as passable as any hw.target handling out there; - * hw.target implies task context, therefore, no migration. - * Which together with DETACH_GROUP means that this is the - * final remove_from_context of a task event. - */ - if (event->hw.target && (flags & DETACH_GROUP)) { - /* - * Now, the problem with, say uprobes, is that they - * use hw.target for context in their ->destroy() - * callbacks. Supposedly, they may need to poke at - * its contents, so better call it while we still - * have the task. - */ - if (event->destroy) { - event->destroy(event); - event->destroy = NULL; - } - put_task_struct(event->hw.target); - event->hw.target = NULL; - } - /* * The above event_function_call() can NO-OP when it hits * TASK_TOMBSTONE. In that case we must already have been detached diff --git a/kernel/fork.c b/kernel/fork.c index 88463fd56930820139346d994a32d865439b313a..231b01eba6e1ea9e3e2237ec6169bde123e0e774 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2063,6 +2063,17 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_free_futex_mutex; + /* + * Now that the cgroups are pinned, re-clone the parent cgroup and put + * the new task on the correct runqueue. All this *before* the task + * becomes visible. + * + * This isn't part of ->can_fork() because while the re-cloning is + * cgroup specific, it unconditionally needs to place the task on a + * runqueue. + */ + sched_cgroup_fork(p); + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. 
In particular, we do @@ -2171,7 +2182,6 @@ static __latent_entropy struct task_struct *copy_process( proc_fork_connector(p); cgroup_post_fork(p); - sched_post_fork(p); cgroup_threadgroup_change_end(current); perf_event_fork(p); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 66a7c4befa0ee67750a54e168588898ad80ec754..89b31e8274255677e7d77e87f85da154a1134d53 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1010,12 +1010,6 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) int ret; #endif - if (!func->old_name || !func->new_func) - return -EINVAL; - - if (strlen(func->old_name) >= KSYM_NAME_LEN) - return -EINVAL; - #ifdef CONFIG_LIVEPATCH_WO_FTRACE ret = arch_klp_func_can_patch(func); if (ret) @@ -1105,6 +1099,16 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) if (klp_is_module(obj) && strlen(obj->name) >= MODULE_NAME_LEN) return -EINVAL; + klp_for_each_func(obj, func) { + if (!func->old_name || !func->new_func) { + pr_err("old_name or new_func is invalid\n"); + return -EINVAL; + } + if (strlen(func->old_name) >= KSYM_NAME_LEN) { + pr_err("old_name is too long\n"); + return -EINVAL; + } + } obj->patched = false; obj->mod = NULL; diff --git a/kernel/module.c b/kernel/module.c index 38c583d54defb733cca928c2c0d6eaca61971fbb..591c176ac6a2621245e05be9451074fa90e3d9be 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3493,12 +3493,6 @@ static noinline int do_init_module(struct module *mod) } freeinit->module_init = mod->init_layout.base; - /* - * We want to find out whether @mod uses async during init. Clear - * PF_USED_ASYNC. async_schedule*() will set it. - */ - current->flags &= ~PF_USED_ASYNC; - do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -3524,22 +3518,13 @@ static noinline int do_init_module(struct module *mod) /* * We need to finish all async code before the module init sequence - * is done. This has potential to deadlock. For example, a newly - * detected block device can trigger request_module() of the - * default iosched from async probing task. Once userland helper - * reaches here, async_synchronize_full() will wait on the async - * task waiting on request_module() and deadlock. - * - * This deadlock is avoided by perfomring async_synchronize_full() - * iff module init queued any async jobs. This isn't a full - * solution as it will deadlock the same if module loading from - * async jobs nests more than once; however, due to the various - * constraints, this hack seems to be the best option for now. - * Please refer to the following thread for details. + * is done. This has potential to deadlock if synchronous module + * loading is requested from async (which is not allowed!). * - * http://thread.gmane.org/gmane.linux.kernel/1420814 + * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous + * request_module() from async workers") for more details. 
*/ - if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) + if (!mod->async_probe_requested) async_synchronize_full(); ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base + diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 4210152e56f0f403c69f1539cc011ae807e5eee0..aad7c8fcb22fe1d745bab01e60c7a4e63672aa75 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -39,23 +39,19 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active) { struct rb_node *node; struct wakelock *wl; - char *str = buf; - char *end = buf + PAGE_SIZE; + int len = 0; mutex_lock(&wakelocks_lock); for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { wl = rb_entry(node, struct wakelock, node); if (wl->ws.active == show_active) - str += scnprintf(str, end - str, "%s ", wl->name); + len += sysfs_emit_at(buf, len, "%s ", wl->name); } - if (str > buf) - str--; - - str += scnprintf(str, end - str, "\n"); + len += sysfs_emit_at(buf, len, "\n"); mutex_unlock(&wakelocks_lock); - return (str - buf); + return len; } #if CONFIG_PM_WAKELOCKS_LIMIT > 0 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 0fe45941b5c7e809c90922fc89c22b81d8f09ccc..c645a7221d0b22e0ad2c6f0c49fd222e9031a499 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2423,9 +2423,9 @@ void console_unlock(void) printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); if (console_seq < log_first_seq) { - len = sprintf(text, - "** %llu printk messages dropped **\n", - log_first_seq - console_seq); + len = snprintf(text, sizeof(text), + "** %llu printk messages dropped **\n", + log_first_seq - console_seq); /* messages are gone, move to first one */ console_seq = log_first_seq; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f32095d7aa5e4e5c771f89639b36afb55b08b077..424a04cdda1d6cb112c939a16c3a657326026db6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -365,6 +365,26 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) return !err; } +static int check_ptrace_options(unsigned long data) +{ + if (data & ~(unsigned long)PTRACE_O_MASK) + return -EINVAL; + + if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { + if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) || + !IS_ENABLED(CONFIG_SECCOMP)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (seccomp_mode(¤t->seccomp) != SECCOMP_MODE_DISABLED || + current->ptrace & PT_SUSPEND_SECCOMP) + return -EPERM; + } + return 0; +} + static int ptrace_attach(struct task_struct *task, long request, unsigned long addr, unsigned long flags) @@ -376,8 +396,16 @@ static int ptrace_attach(struct task_struct *task, long request, if (seize) { if (addr != 0) goto out; + /* + * This duplicates the check in check_ptrace_options() because + * ptrace_attach() and ptrace_setoptions() have historically + * used different error codes for unknown ptrace options. 
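For context, the historical difference that comment refers to is visible from userspace: PTRACE_SEIZE has traditionally rejected unknown option bits with -EIO, while PTRACE_SETOPTIONS rejects them with -EINVAL. A hedged, illustrative userspace probe (not part of the kernel patch; 'child' is a traced pid and bad_option_bit is any bit outside PTRACE_O_MASK):

/* Illustrative userspace probe of the two error codes. */
#include <errno.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>

static void demo_option_errors(pid_t child, unsigned long bad_option_bit)
{
	if (ptrace(PTRACE_SEIZE, child, (void *)0, (void *)bad_option_bit) == -1)
		printf("PTRACE_SEIZE: errno=%d (traditionally EIO)\n", errno);
	if (ptrace(PTRACE_SETOPTIONS, child, (void *)0, (void *)bad_option_bit) == -1)
		printf("PTRACE_SETOPTIONS: errno=%d (traditionally EINVAL)\n", errno);
}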
+ */ if (flags & ~(unsigned long)PTRACE_O_MASK) goto out; + retval = check_ptrace_options(flags); + if (retval) + return retval; flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); } else { flags = PT_PTRACED; @@ -650,22 +678,11 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds static int ptrace_setoptions(struct task_struct *child, unsigned long data) { unsigned flags; + int ret; - if (data & ~(unsigned long)PTRACE_O_MASK) - return -EINVAL; - - if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { - if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) || - !IS_ENABLED(CONFIG_SECCOMP)) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (seccomp_mode(¤t->seccomp) != SECCOMP_MODE_DISABLED || - current->ptrace & PT_SUSPEND_SECCOMP) - return -EPERM; - } + ret = check_ptrace_options(data); + if (ret) + return ret; /* Avoid intermediate state when all opts are cleared */ flags = child->ptrace; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b36a3b4c60e9c3861a26971d4465c35fe50cf95d..b091537102590fa56c7f57a29a525bea9d992948 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2357,8 +2357,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_post_fork(struct task_struct *p) +void sched_cgroup_fork(struct task_struct *p) { + unsigned long flags; /* @@ -2369,6 +2370,9 @@ void sched_post_fork(struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); +#ifdef CONFIG_CGROUP_SCHED + p->sched_task_group = task_group(current); + #endif rseq_migrate(p); /* * We're setting the CPU for the first time, we don't migrate, @@ -6981,13 +6985,10 @@ static int tg_change_scheduler(struct task_group *tg, void *data) struct cgroup_subsys_state *css = &tg->css; tg->qos_level = qos_level; - if (qos_level == -1) { + if (qos_level == -1) policy = SCHED_IDLE; - cfs_bandwidth_usage_inc(); - } else { + else policy = SCHED_NORMAL; - cfs_bandwidth_usage_dec(); - } param.sched_priority = 0; css_task_iter_start(css, 0, &it); @@ -7015,6 +7016,13 @@ static int cpu_qos_write(struct cgroup_subsys_state *css, if (tg->qos_level == -1 && qos_level == 0) return -EINVAL; + cpus_read_lock(); + if (qos_level == -1) + cfs_bandwidth_usage_inc(); + else + cfs_bandwidth_usage_dec(); + cpus_read_unlock(); + rcu_read_lock(); walk_tg_tree_from(tg, tg_change_scheduler, tg_nop, (void *)(&qos_level)); rcu_read_unlock(); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8dd27c1fbb29fa8b7df57acadfed357c712fbf52..0df2448eb5d1c6dc825f3162725331cf8d377a2e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -152,10 +152,10 @@ void account_guest_time(struct task_struct *p, u64 cputime) /* Add guest time to cpustat. 
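In the guest-time hunk just below, switching from a direct cpustat[] write to task_group_account_field() keeps the global counters intact while also charging the task's cgroup. Roughly, paraphrased from kernel/sched/cputime.c of this kernel vintage and shown here only for context:

/* Rough shape of the helper the hunk below switches to (paraphrased). */
static inline void task_group_account_field_sketch(struct task_struct *p, int index,
						   u64 tmp)
{
	/* Global per-cpu kernel_cpustat update, as before ... */
	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
	/* ... plus cgroup (cpuacct) accounting, which the old guest-time path skipped. */
	cgroup_account_cputime_field(p, index, tmp);
}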
*/ if (task_nice(p) > 0) { - cpustat[CPUTIME_NICE] += cputime; + task_group_account_field(p, CPUTIME_NICE, cputime); cpustat[CPUTIME_GUEST_NICE] += cputime; } else { - cpustat[CPUTIME_USER] += cputime; + task_group_account_field(p, CPUTIME_USER, cputime); cpustat[CPUTIME_GUEST] += cputime; } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 52029f3a74394b1d02956f816d10e8a67244bf02..89937f7ca0bdf75a223a641414dbbf85f44de624 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5059,6 +5059,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); +#ifdef CONFIG_QOS_SCHED + INIT_LIST_HEAD(&cfs_rq->qos_throttled_list); +#endif } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -6253,7 +6256,8 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, &p->cpus_allowed) || + !cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; @@ -6925,7 +6929,8 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); - list_add(&cfs_rq->throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); + list_add(&cfs_rq->qos_throttled_list, + &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); } static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) @@ -6943,7 +6948,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) update_rq_clock(rq); cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; - list_del_init(&cfs_rq->throttled_list); + list_del_init(&cfs_rq->qos_throttled_list); /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); @@ -6984,7 +6989,7 @@ static int __unthrottle_qos_cfs_rqs(int cpu) int res = 0; list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(qos_throttled_cfs_rq, cpu), - throttled_list) { + qos_throttled_list) { if (cfs_rq_throttled(cfs_rq)) { unthrottle_qos_cfs_rq(cfs_rq); res++; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5a9c279567921b5e0d258921dd86f0af400cbc42..301ba04d913005412fe73e9a1b453364c16ae018 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -50,11 +50,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.function = sched_rt_period_timer; } -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) { - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return; - raw_spin_lock(&rt_b->rt_runtime_lock); if (!rt_b->rt_period_active) { rt_b->rt_period_active = 1; @@ -72,6 +69,14 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return; + + do_start_rt_bandwidth(rt_b); +} + void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; @@ -979,13 +984,17 @@ static void update_curr_rt(struct rq *rq) for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + int exceeded; if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) + exceeded = sched_rt_runtime_exceeded(rt_rq); + if (exceeded) 
resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (exceeded) + do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } } @@ -2654,8 +2663,12 @@ static int sched_rt_global_validate(void) static void sched_rt_do_global(void) { + unsigned long flags; + + raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); def_rt_bandwidth.rt_runtime = global_rt_runtime(); def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); + raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); } int sched_rt_handler(struct ctl_table *table, int write, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1aaff1aa89f62b1a44df2f87a6f432ce7b0dc474..ae3068153093881ef4f6743377c2ef2869e22dd6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -592,6 +592,11 @@ struct cfs_rq { #endif KABI_RESERVE(2) +#ifndef __GENKSYMS__ +#ifdef CONFIG_QOS_SCHED + struct list_head qos_throttled_list; +#endif +#endif }; static inline int rt_bandwidth_enabled(void) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index fd023ac24e10ea16dde538a4a3274387e9b7df8d..369b63ac7cf9b6aa8e5d310f5a26d20331dfef9d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -28,6 +28,9 @@ #include #include +/* Not exposed in headers: strictly internal use only. */ +#define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1) + #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER #include #endif @@ -629,6 +632,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif + current->seccomp.mode = SECCOMP_MODE_DEAD; seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true); do_exit(SIGKILL); } @@ -743,6 +747,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_KILL_PROCESS: default: + current->seccomp.mode = SECCOMP_MODE_DEAD; seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ if (action == SECCOMP_RET_KILL_PROCESS || @@ -793,6 +798,11 @@ int __secure_computing(const struct seccomp_data *sd) return 0; case SECCOMP_MODE_FILTER: return __seccomp_filter(this_syscall, sd, false); + /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. 
*/ + case SECCOMP_MODE_DEAD: + WARN_ON_ONCE(1); + do_exit(SIGKILL); + return -1; default: BUG(); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5445dead9911a3933a36b72f3982460fd627b056..d2eebbcdff52e7cb6ba6edd115667bdc0135212f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -257,6 +257,28 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, #endif +#ifdef CONFIG_BPF_SYSCALL +static int bpf_unpriv_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, unpriv_enable = *(int *)table->data; + bool locked_state = unpriv_enable == 1; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &unpriv_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (locked_state && unpriv_enable != 1) + return -EPERM; + *(int *)table->data = unpriv_enable; + } + return ret; +} +#endif + static struct ctl_table kern_table[]; static struct ctl_table vm_table[]; static struct ctl_table fs_table[]; @@ -1247,10 +1269,9 @@ static struct ctl_table kern_table[] = { .data = &sysctl_unprivileged_bpf_disabled, .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &one, + .proc_handler = bpf_unpriv_handler, + .extra1 = &zero, + .extra2 = &two, }, #endif #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8f20a658dabb4dd54ee33ed65666211f4e120889..feed30698280f9e84723d3b41eb3b68a651bf1e6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -232,6 +232,10 @@ __setup("trace_clock=", set_trace_boot_clock); static int __init set_tracepoint_printk(char *str) { + /* Ignore the "tp_printk_stop_on_boot" param */ + if (*str == '_') + return 0; + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) tracepoint_printk = 1; return 1; @@ -2786,7 +2790,7 @@ struct trace_buffer_struct { char buffer[4][TRACE_BUF_SIZE]; }; -static struct trace_buffer_struct *trace_percpu_buffer; +static struct trace_buffer_struct __percpu *trace_percpu_buffer; /* * Thise allows for lockless recording. 
If we're nested too deeply, then @@ -2796,7 +2800,7 @@ static char *get_trace_buf(void) { struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); - if (!buffer || buffer->nesting >= 4) + if (!trace_percpu_buffer || buffer->nesting >= 4) return NULL; buffer->nesting++; @@ -2815,7 +2819,7 @@ static void put_trace_buf(void) static int alloc_percpu_trace_buffer(void) { - struct trace_buffer_struct *buffers; + struct trace_buffer_struct __percpu *buffers; buffers = alloc_percpu(struct trace_buffer_struct); if (WARN(!buffers, "Could not allocate percpu trace_printk buffer")) diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 370724b4539185c9cc9b6323feab36b414836903..8d7c6d3f1daa71a0b1d6221e26999beb618f2a70 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -46,11 +46,10 @@ void bacct_add_tsk(struct user_namespace *user_ns, /* Convert to seconds for btime */ do_div(delta, USEC_PER_SEC); stats->ac_btime = get_seconds() - delta; - if (thread_group_leader(tsk)) { + if (tsk->flags & PF_EXITING) stats->ac_exitcode = tsk->exit_code; - if (tsk->flags & PF_FORKNOEXEC) - stats->ac_flag |= AFORK; - } + if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC)) + stats->ac_flag |= AFORK; if (tsk->flags & PF_SUPERPRIV) stats->ac_flag |= ASU; if (tsk->flags & PF_DUMPCORE) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 25668139fc1f70dc6e5c33acee69781f533cf128..b80320956caf4829af4d444ba4085eb7fbca65cb 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -506,6 +506,7 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by return 0; pipe->nrbufs++; buf->ops = &page_cache_pipe_buf_ops; + buf->flags = 0; get_page(buf->page = page); buf->offset = offset; buf->len = bytes; @@ -630,6 +631,7 @@ static size_t push_pipe(struct iov_iter *i, size_t size, break; pipe->nrbufs++; pipe->bufs[idx].ops = &default_pipe_buf_ops; + pipe->bufs[idx].flags = 0; pipe->bufs[idx].page = page; pipe->bufs[idx].offset = 0; if (left <= PAGE_SIZE) { diff --git a/mm/clear_freelist_page.c b/mm/clear_freelist_page.c index 69975f458dc79f5acaa78190da5d05cf98097f95..1ea59a3e527fef571612a2f08fe75844e5c7984b 100644 --- a/mm/clear_freelist_page.c +++ b/mm/clear_freelist_page.c @@ -13,8 +13,10 @@ #include #include #include +#include #include +#define CFP_DEFAULT_TIMEOUT 2000 #define for_each_populated_zone_pgdat(pgdat, zone) \ for (zone = pgdat->node_zones; \ zone; \ @@ -32,6 +34,7 @@ static DECLARE_WAIT_QUEUE_HEAD(clear_freelist_wait); static DEFINE_MUTEX(clear_freelist_lock); static atomic_t clear_freelist_workers; static atomic_t clear_pages_num; +static ulong cfp_timeout_ms = CFP_DEFAULT_TIMEOUT; static int one = 1; /* @@ -51,15 +54,25 @@ static struct zone *next_pgdat_zone(struct zone *zone) static void clear_pgdat_freelist_pages(struct work_struct *work) { struct pgdat_entry *entry = container_of(work, struct pgdat_entry, work); + u64 cfp_timeout_ns = cfp_timeout_ms * NSEC_PER_MSEC; struct pglist_data *pgdat = entry->pgdat; unsigned long flags, order, t; struct page *page; struct zone *zone; + u64 start, now; + + start = sched_clock(); for_each_populated_zone_pgdat(pgdat, zone) { spin_lock_irqsave(&zone->lock, flags); for_each_migratetype_order(order, t) { list_for_each_entry(page, &zone->free_area[order].free_list[t], lru) { + now = sched_clock(); + if (unlikely(now - start > cfp_timeout_ns)) { + spin_unlock_irqrestore(&zone->lock, flags); + goto out; + } + #ifdef CONFIG_KMAP_LOCAL int i; @@ -77,6 +90,8 @@ static void clear_pgdat_freelist_pages(struct work_struct *work) cond_resched(); } + 
+out: kfree(entry); if (atomic_dec_and_test(&clear_freelist_workers)) @@ -154,10 +169,20 @@ static struct ctl_table sys_ctl_table[] = { { } }; +static bool clear_freelist_enabled; +static int __init setup_clear_freelist(char *str) +{ + clear_freelist_enabled = true; + return 1; +} +__setup("clear_freelist", setup_clear_freelist); + static int __init clear_freelist_init(void) { - register_sysctl_table(sys_ctl_table); + if (clear_freelist_enabled) + register_sysctl_table(sys_ctl_table); return 0; } module_init(clear_freelist_init); +module_param(cfp_timeout_ms, ulong, 0644); diff --git a/mm/filemap.c b/mm/filemap.c index a89d70097e686cc52d4668480e644755990bcd82..9b6e72e14a04b8dfbc52bb4e83007a283441c703 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -291,6 +291,7 @@ static void unaccount_page_cache_page(struct address_space *mapping, __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); + shmem_reliable_page_counter(page, -nr); if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); } else { @@ -895,8 +896,10 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) */ if (!PageHuge(new)) __inc_node_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(new)) + if (PageSwapBacked(new)) { __inc_node_page_state(new, NR_SHMEM); + shmem_reliable_page_counter(new, 1); + } xa_unlock_irqrestore(&mapping->i_pages, flags); mem_cgroup_migrate(old, new); radix_tree_preload_end(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 68cec97bbd066f11203728b7ee585c5eeff8ef41..5b6a050fd6c15c991a31fa06c51aeeadd34fdde5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1444,6 +1444,33 @@ pgoff_t hugetlb_basepage_index(struct page *page) return (index << compound_order(page_head)) + compound_idx; } +#define HUGE_PAGE_BOOTMEM_ALLOC 0 +#define HUGE_PAGE_FRESH_ALLOC 1 + +static u64 normal_page_reserve_sz; + +static int __init early_normal_page_reserve(char *p) +{ + unsigned long long size; + + if (!p) + return 1; + + size = memparse(p, &p); + if (*p) { + pr_warn("HugeTLB: Invalid normal page reserved size\n"); + return 1; + } + + normal_page_reserve_sz = size & PAGE_MASK; + + pr_info("HugeTLB: Normal page reserved %lldMB\n", + normal_page_reserve_sz >> 20); + + return 0; +} +early_param("hugepage_prohibit_sz", early_normal_page_reserve); + static struct page *alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) @@ -1491,6 +1518,45 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, return page; } +static bool __ref huge_page_limit_check(int type, size_t hsize, int nid) +{ + u64 mem_usable = 0; + char *str = NULL; + char buf[32]; + + if (!normal_page_reserve_sz) + return true; + + if (system_state > SYSTEM_SCHEDULING) + return true; + + if (normal_page_reserve_sz >= memblock_phys_mem_size()) { + mem_usable = memblock_phys_mem_size(); + str = "physical memory"; + goto out; + } + + if (type == HUGE_PAGE_BOOTMEM_ALLOC) { + mem_usable = memblock_phys_mem_size() - memblock_reserved_size(); + str = "memblock usable"; + } else if (type == HUGE_PAGE_FRESH_ALLOC) { + mem_usable = nr_free_pages() << PAGE_SHIFT; + str = "free page"; + } + + if (mem_usable < normal_page_reserve_sz + hsize) + goto out; + + return true; +out: + string_get_size(hsize, 1, STRING_UNITS_2, buf, 32); + pr_info("HugeTLB: allocating(%s) + Normal pages reserved(%lldMB) node%d exceed %s size(%lldMB)\n", + buf, normal_page_reserve_sz >> 20, + nid, str, mem_usable 
>> 20); + + return false; +} + /* * Common helper to allocate a fresh hugetlb page. All specific allocators * should use this function to get new hugetlb pages @@ -1501,6 +1567,9 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, { struct page *page; + if (!huge_page_limit_check(HUGE_PAGE_FRESH_ALLOC, huge_page_size(h), nid)) + return NULL; + if (hstate_is_gigantic(h)) page = alloc_gigantic_page(h, gfp_mask, nid, nmask); else @@ -2251,6 +2320,10 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) if (nid != NUMA_NO_NODE && nid >= nr_online_nodes) return 0; + + if (!huge_page_limit_check(HUGE_PAGE_BOOTMEM_ALLOC, huge_page_size(h), nid)) + return 0; + /* do node specific alloc */ if (nid != NUMA_NO_NODE) { m = memblock_virt_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 301d6aa079d7b83899fb4f1357a50d6d35336db3..2975fc124cb6a404a2cc71f956127288a3c87932 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1559,6 +1559,7 @@ static void collapse_shmem(struct mm_struct *mm, ClearPageActive(page); ClearPageUnevictable(page); unlock_page(page); + shmem_reliable_page_counter(page, -1); put_page(page); index++; } @@ -1573,6 +1574,7 @@ static void collapse_shmem(struct mm_struct *mm, mem_cgroup_commit_charge(new_page, memcg, false, true); count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_anon(new_page); + shmem_reliable_page_counter(new_page, 1 << HPAGE_PMD_ORDER); /* * Remove pte page tables, so we can re-fault the page as huge. diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 62df78657ff94fbd4615b6efe02d4046929d1538..f8b8543d7933c4fbe22654ab203ecca751040d23 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -9,8 +9,7 @@ #include #include #include - -#define MEM_RELIABLE_RESERVE_MIN (256UL << 20) +#include enum mem_reliable_types { MEM_RELIABLE_ALL, @@ -30,16 +29,21 @@ bool reliable_allow_fallback __read_mostly = true; bool shmem_reliable __read_mostly = true; struct percpu_counter reliable_shmem_used_nr_page __read_mostly; DEFINE_PER_CPU(long, nr_reliable_buddy_pages); -unsigned long nr_reliable_reserve_pages = MEM_RELIABLE_RESERVE_MIN / PAGE_SIZE; long shmem_reliable_nr_page = LONG_MAX; bool pagecache_use_reliable_mem __read_mostly = true; -DEFINE_PER_CPU(long, pagecache_reliable_pages); -DEFINE_PER_CPU(long, anon_reliable_pages); +struct percpu_counter pagecache_reliable_pages; +struct percpu_counter anon_reliable_pages; static unsigned long zero; static unsigned long reliable_pagecache_max_bytes = ULONG_MAX; +bool mem_reliable_counter_initialized(void) +{ + return likely(percpu_counter_initialized(&pagecache_reliable_pages)) && + likely((percpu_counter_initialized(&anon_reliable_pages))); +} + bool mem_reliable_status(void) { return mem_reliable_is_enabled(); @@ -66,9 +70,9 @@ void reliable_lru_add_batch(int zid, enum lru_list lru, int val) if (zid < ZONE_MOVABLE && zid >= 0) { if (is_file_lru(lru)) - this_cpu_add(pagecache_reliable_pages, val); + percpu_counter_add(&pagecache_reliable_pages, val); else if (is_anon_lru(lru)) - this_cpu_add(anon_reliable_pages, val); + percpu_counter_add(&anon_reliable_pages, val); } } @@ -78,14 +82,14 @@ void reliable_lru_add(enum lru_list lru, struct page *page, int val) return; if (is_file_lru(lru)) - this_cpu_add(pagecache_reliable_pages, val); + percpu_counter_add(&pagecache_reliable_pages, val); else if (is_anon_lru(lru)) - this_cpu_add(anon_reliable_pages, val); + percpu_counter_add(&anon_reliable_pages, val); else if (lru == LRU_UNEVICTABLE) { 
if (PageAnon(page)) - this_cpu_add(anon_reliable_pages, val); + percpu_counter_add(&anon_reliable_pages, val); else - this_cpu_add(pagecache_reliable_pages, val); + percpu_counter_add(&pagecache_reliable_pages, val); } } @@ -122,6 +126,11 @@ void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn) if (!reliable_enabled) return; + if (is_kdump_kernel()) { + pr_err("init failed, the kdump is in progress\n"); + return; + } + if (atomic_long_read(&total_reliable_mem) == 0) { memset(zone_movable_pfn, 0, sizeof(unsigned long) * MAX_NUMNODES); @@ -187,22 +196,19 @@ static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) void reliable_report_meminfo(struct seq_file *m) { - bool pagecache_enabled = pagecache_reliable_is_enabled(); - long nr_pagecache_pages = 0; - long nr_anon_pages = 0; + s64 nr_pagecache_pages = 0; + s64 nr_anon_pages = 0; long nr_buddy_pages = 0; int cpu; if (!mem_reliable_is_enabled()) return; - for_each_possible_cpu(cpu) { + for_each_possible_cpu(cpu) nr_buddy_pages += per_cpu(nr_reliable_buddy_pages, cpu); - nr_anon_pages += per_cpu(anon_reliable_pages, cpu); - if (pagecache_enabled) - nr_pagecache_pages += - per_cpu(pagecache_reliable_pages, cpu); - } + + nr_anon_pages = percpu_counter_sum_positive(&anon_reliable_pages); + nr_pagecache_pages = percpu_counter_sum_positive(&pagecache_reliable_pages); show_val_kb(m, "ReliableTotal: ", total_reliable_mem_sz() >> PAGE_SHIFT); @@ -216,13 +222,14 @@ void reliable_report_meminfo(struct seq_file *m) percpu_counter_sum(&reliable_shmem_used_nr_page)); } - if (pagecache_enabled) { + if (pagecache_reliable_is_enabled()) { unsigned long num = 0; num += global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); num += global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); show_val_kb(m, "FileCache: ", num); - show_val_kb(m, "ReliableFileCache:", nr_pagecache_pages); + seq_printf(m, "ReliableFileCache: %8llu kB\n", + nr_pagecache_pages << (PAGE_SHIFT - 10)); } } @@ -327,29 +334,6 @@ int reliable_debug_handler(struct ctl_table *table, int write, return ret; } -static unsigned long sysctl_reliable_reserve_size = MEM_RELIABLE_RESERVE_MIN; - -int reliable_reserve_size_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) -{ - unsigned long *data_ptr = (unsigned long *)(table->data); - unsigned long old = *data_ptr; - int ret; - - ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); - if (ret == 0 && write) { - if (*data_ptr > total_reliable_mem_sz() || - *data_ptr < MEM_RELIABLE_RESERVE_MIN) { - *data_ptr = old; - return -EINVAL; - } - - nr_reliable_reserve_pages = *data_ptr / PAGE_SIZE; - } - - return ret; -} - #ifdef CONFIG_SHMEM static unsigned long sysctl_shmem_reliable_bytes_limit = ULONG_MAX; @@ -406,13 +390,6 @@ static struct ctl_table reliable_ctl_table[] = { .mode = 0600, .proc_handler = reliable_debug_handler, }, - { - .procname = "reliable_reserve_size", - .data = &sysctl_reliable_reserve_size, - .maxlen = sizeof(sysctl_reliable_reserve_size), - .mode = 0644, - .proc_handler = reliable_reserve_size_handler, - }, #ifdef CONFIG_SHMEM { .procname = "shmem_reliable_bytes_limit", @@ -445,8 +422,7 @@ static struct ctl_table reliable_dir_table[] = { void page_cache_prepare_alloc(gfp_t *gfp) { - long nr_reliable = 0; - int cpu; + s64 nr_reliable = 0; if (!mem_reliable_is_enabled()) return; @@ -454,11 +430,7 @@ void page_cache_prepare_alloc(gfp_t *gfp) if (!pagecache_reliable_is_enabled()) goto no_reliable; - for_each_possible_cpu(cpu) - 
nr_reliable += this_cpu_read(pagecache_reliable_pages); - - if (nr_reliable < 0) - goto no_reliable; + nr_reliable = percpu_counter_read_positive(&pagecache_reliable_pages); if (nr_reliable > reliable_pagecache_max_bytes >> PAGE_SHIFT) goto no_reliable; @@ -480,9 +452,12 @@ static int __init reliable_sysctl_init(void) return -1; } + percpu_counter_init(&pagecache_reliable_pages, 0, GFP_KERNEL); + percpu_counter_init(&anon_reliable_pages, 0, GFP_KERNEL); + return 0; } -late_initcall(reliable_sysctl_init); +arch_initcall(reliable_sysctl_init); #else static void mem_reliable_ctrl_bit_disabled(int idx) {} #endif @@ -515,21 +490,40 @@ static void mem_reliable_feature_disable(int idx) void reliable_show_mem_info(void) { - int cpu; - long num = 0; + s64 nr_pagecache_pages = 0; + s64 nr_anon_pages = 0; if (!mem_reliable_is_enabled()) return; - for_each_possible_cpu(cpu) { - num += per_cpu(anon_reliable_pages, cpu); - num += per_cpu(pagecache_reliable_pages, cpu); + nr_anon_pages = percpu_counter_sum_positive(&anon_reliable_pages); + nr_pagecache_pages = percpu_counter_sum_positive(&pagecache_reliable_pages); + + pr_info("ReliableTotal: %lu kB\n", total_reliable_mem_sz() >> 10); + pr_info("ReliableUsed: %lu kB\n", used_reliable_mem_sz() >> 10); + pr_info("ReliableTaskLimit: %lu kB\n", task_reliable_limit >> 10); + pr_info("ReliableTaskUsed: %lld kB\n", + (nr_anon_pages + nr_pagecache_pages) << (PAGE_SHIFT - 10)); + + if (shmem_reliable_is_enabled()) { + pr_info("ReliableShmemPagesLimit: %ld\n", + shmem_reliable_nr_page); + pr_info("ReliableShmem: %llu kB\n", + percpu_counter_sum(&reliable_shmem_used_nr_page) + << (PAGE_SHIFT - 10)); } - pr_info("ReliableTotal: %lu kB", total_reliable_mem_sz() >> 10); - pr_info("ReliableUsed: %lu kB", used_reliable_mem_sz() >> 10); - pr_info("task_reliable_limit: %lu kB", task_reliable_limit >> 10); - pr_info("reliable_user_used: %ld kB", num << (PAGE_SHIFT - 10)); + if (pagecache_reliable_is_enabled()) { + unsigned long num = 0; + + num += global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); + num += global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); + pr_info("ReliableFileCacheLimit: %lu kB\n", + reliable_pagecache_max_bytes >> 10); + pr_info("FileCache: %lu kB\n", num << (PAGE_SHIFT - 10)); + pr_info("ReliableFileCache: %llu kB\n", + nr_pagecache_pages << (PAGE_SHIFT - 10)); + } } void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order, diff --git a/mm/memblock.c b/mm/memblock.c index 80c6975ace6f29b4bed3d3048b13ec4bc2ef6d7d..daa2c42a02d97f08527e4059f0f5612b919199ee 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -330,14 +330,20 @@ void __init memblock_discard(void) addr = __pa(memblock.reserved.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.reserved.max); - __memblock_free_late(addr, size); + if (memblock_reserved_in_slab) + kfree(memblock.reserved.regions); + else + __memblock_free_late(addr, size); } if (memblock.memory.regions != memblock_memory_init_regions) { addr = __pa(memblock.memory.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.memory.max); - __memblock_free_late(addr, size); + if (memblock_memory_in_slab) + kfree(memblock.memory.regions); + else + __memblock_free_late(addr, size); } } #endif diff --git a/mm/memfd.c b/mm/memfd.c index 9e68a4320a0e4c683826e19e6b2d4758fb1c83f8..2d19288a093ea820d68779af509970838b4c8500 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -34,26 +34,35 @@ static void memfd_tag_pins(struct address_space *mapping) void __rcu **slot; pgoff_t start; struct page *page; - 
unsigned int tagged = 0; + int latency = 0; + int cache_count; lru_add_drain(); start = 0; xa_lock_irq(&mapping->i_pages); radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { + cache_count = 1; page = radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - if (!page || radix_tree_exception(page)) { + if (!page || radix_tree_exception(page) || PageTail(page)) { if (radix_tree_deref_retry(page)) { slot = radix_tree_iter_retry(&iter); continue; } - } else if (page_count(page) - page_mapcount(page) > 1) { - radix_tree_tag_set(&mapping->i_pages, iter.index, - MEMFD_TAG_PINNED); + } else { + if (PageTransHuge(page) && !PageHuge(page)) + cache_count = HPAGE_PMD_NR; + if (cache_count != + page_count(page) - total_mapcount(page)) { + radix_tree_tag_set(&mapping->i_pages, + iter.index, MEMFD_TAG_PINNED); + } } - if (++tagged % 1024) + latency += cache_count; + if (latency < 1024) continue; + latency = 0; slot = radix_tree_iter_resume(slot, &iter); xa_unlock_irq(&mapping->i_pages); @@ -79,6 +88,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) pgoff_t start; struct page *page; int error, scan; + int cache_count; memfd_tag_pins(mapping); @@ -107,8 +117,12 @@ static int memfd_wait_for_pins(struct address_space *mapping) page = NULL; } - if (page && - page_count(page) - page_mapcount(page) != 1) { + cache_count = 1; + if (page && PageTransHuge(page) && !PageHuge(page)) + cache_count = HPAGE_PMD_NR; + + if (page && cache_count != + page_count(page) - total_mapcount(page)) { if (scan < LAST_SCAN) goto continue_resched; diff --git a/mm/memory.c b/mm/memory.c index d4853970a7c104807337806a2d3b6a62adbd10fa..8be034c0f10bcc1fa36bcf207d09387341304ec4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4721,16 +4721,15 @@ static int clear_gigantic_page_chunk(unsigned long start, unsigned long end, struct cgp_args *args) { struct page *base_page = args->base_page; - struct page *p = base_page; + struct page *p = mem_map_offset(base_page, start); unsigned long addr = args->addr; unsigned long i; might_sleep(); - for (i = start; i < end; ++i) { + for (i = start; i < end; i++, p = mem_map_next(p, base_page, i)) { cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); - - p = mem_map_next(p, base_page, i); + if (p) + clear_user_highpage(p, addr + i * PAGE_SIZE); } return KTASK_RETURN_SUCCESS; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 76a577cfc27780770a35817b2f364227948141e5..36961d6d364e4518c327f21fbc0a61bce22cf1a6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1259,11 +1259,10 @@ static struct page *new_page(struct page *page, unsigned long start) } #endif -static long do_mbind(unsigned long start, unsigned long len, - unsigned short mode, unsigned short mode_flags, - nodemask_t *nmask, unsigned long flags) +long __do_mbind(unsigned long start, unsigned long len, + unsigned short mode, unsigned short mode_flags, + nodemask_t *nmask, unsigned long flags, struct mm_struct *mm) { - struct mm_struct *mm = current->mm; struct mempolicy *new; unsigned long end; int err; @@ -1364,6 +1363,13 @@ static long do_mbind(unsigned long start, unsigned long len, return err; } +static long do_mbind(unsigned long start, unsigned long len, + unsigned short mode, unsigned short mode_flags, + nodemask_t *nmask, unsigned long flags) +{ + return __do_mbind(start, len, mode, mode_flags, nmask, flags, current->mm); +} + /* * User space interface with variable sized bitmaps for nodelists. 
*/ diff --git a/mm/migrate.c b/mm/migrate.c index f7721d0aece5f923237f9b4713a6da3415c6c48a..3a20900e19e089b47a92151c15eb36d9c525f1dd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -473,6 +473,10 @@ int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); + if (pslot == NULL) { + xa_unlock_irq(&mapping->i_pages); + return -EAGAIN; + } expected_count += hpage_nr_pages(page) + page_has_private(page); if (page_count(page) != expected_count || @@ -548,6 +552,11 @@ int migrate_page_move_mapping(struct address_space *mapping, xa_unlock(&mapping->i_pages); /* Leave irq disabled to prevent preemption while updating stats */ + if (PageSwapBacked(page) && !PageSwapCache(page)) { + shmem_reliable_page_counter(page, -nr); + shmem_reliable_page_counter(newpage, nr); + } + /* * If moved to a different zone then also account * the page for that zone. Other VM counters will be @@ -591,6 +600,10 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, xa_lock_irq(&mapping->i_pages); pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); + if (pslot == NULL) { + xa_unlock_irq(&mapping->i_pages); + return -EAGAIN; + } expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || diff --git a/mm/mmap.c b/mm/mmap.c index e529384b8ac62bcf98f6f05f0b7271ee585a7e5c..ce3fba7c31cb0283551f226938de48f3b9850094 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2382,6 +2382,10 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, sp_area_work_around(&info, flags); + /* Align vmstart to len */ + if (filp && (flags & MAP_ALIGN)) + info.align_mask = ((len >> PAGE_SHIFT) - 1) << PAGE_SHIFT; + return vm_unmapped_area(&info); } #endif @@ -2437,6 +2441,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, sp_area_work_around(&info, flags); + /* Align vmstart to len */ + if (filp && (flags & MAP_ALIGN)) + info.align_mask = ((len >> PAGE_SHIFT) - 1) << PAGE_SHIFT; + addr = vm_unmapped_area(&info); /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd4354d9ebad046f13f32e84fc742500e1261da0..3ae924633e2c597f333fe906c23a1e0e722ac7dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3671,60 +3671,6 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, return page; } -#ifdef CONFIG_MEMORY_RELIABLE -static inline struct zone *reliable_fb_find_zone(gfp_t gfp_mask, - struct alloc_context *ac) -{ - if (!reliable_allow_fb_enabled()) - return NULL; - - /* dst nodemask may don't have zone we want, fallback here */ - if ((gfp_mask & __GFP_THISNODE) && (ac->high_zoneidx == ZONE_NORMAL) && - (gfp_mask & ___GFP_RELIABILITY)) { - struct zoneref *ref = first_zones_zonelist( - ac->zonelist, ZONE_MOVABLE, ac->nodemask); - return ref->zone; - } - - return NULL; -} - -static inline struct page * -reliable_fb_before_oom(gfp_t gfp_mask, int order, - const struct alloc_context *ac) -{ - if (!reliable_allow_fb_enabled()) - return NULL; - - /* key user process alloc mem from movable zone to avoid oom */ - if ((ac->high_zoneidx == ZONE_NORMAL) && - (gfp_mask & ___GFP_RELIABILITY)) { - struct alloc_context tmp_ac = *ac; - - tmp_ac.high_zoneidx = ZONE_MOVABLE; - tmp_ac.preferred_zoneref = first_zones_zonelist( - ac->zonelist, ZONE_MOVABLE, ac->nodemask); - return get_page_from_freelist( - (gfp_mask | __GFP_HARDWALL) & ~__GFP_DIRECT_RECLAIM, - order, ALLOC_WMARK_HIGH | ALLOC_CPUSET, &tmp_ac); - } - - return NULL; -} -#else -static inline struct zone *reliable_fb_find_zone(gfp_t 
gfp_mask, - struct alloc_context *ac) -{ - return NULL; -} - -static inline struct page *reliable_fb_before_oom(gfp_t gfp_mask, int order, - const struct alloc_context *ac) -{ - return NULL; -} -#endif - static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac, unsigned long *did_some_progress) @@ -3763,10 +3709,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (page) goto out; - page = reliable_fb_before_oom(gfp_mask, order, ac); - if (page) - goto out; - /* Coredumps can quickly deplete all memory reserves */ if (current->flags & PF_DUMPCORE) goto out; @@ -4329,6 +4271,29 @@ check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) return false; } +#ifdef CONFIG_MEMORY_RELIABLE +static inline void mem_reliable_fallback_slowpath(gfp_t gfp_mask, + struct alloc_context *ac) +{ + if (!reliable_allow_fb_enabled()) + return; + + if (gfp_mask & __GFP_NOFAIL) + return; + + if ((ac->high_zoneidx == ZONE_NORMAL) && + (gfp_mask & ___GFP_RELIABILITY)) { + ac->high_zoneidx = gfp_zone(gfp_mask & ~___GFP_RELIABILITY); + ac->preferred_zoneref = first_zones_zonelist( + ac->zonelist, ac->high_zoneidx, ac->nodemask); + return; + } +} +#else +static inline void mem_reliable_fallback_slowpath(gfp_t gfp_mask, + struct alloc_context *ac) {} +#endif + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -4374,17 +4339,14 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); - if (!ac->preferred_zoneref->zone) { - ac->preferred_zoneref->zone = - reliable_fb_find_zone(gfp_mask, ac); - - if (!ac->preferred_zoneref->zone) - goto nopage; - } + if (!ac->preferred_zoneref->zone) + goto nopage; if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, gfp_mask, ac); + mem_reliable_fallback_slowpath(gfp_mask, ac); + /* * The adjusted alloc_flags might result in immediate success, so try * that first @@ -4639,69 +4601,93 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) ac->high_zoneidx, ac->nodemask); } -/* - * return false means this allocation is limit by reliable user limit and - * this will lead to pagefault_out_of_memory() - */ -static inline bool prepare_before_alloc(gfp_t *gfp_mask, unsigned int order) +static inline void prepare_before_alloc(gfp_t *gfp_mask) { gfp_t gfp_ori = *gfp_mask; + bool zone_movable; + *gfp_mask &= gfp_allowed_mask; if (!mem_reliable_is_enabled()) - return true; + return; - if (*gfp_mask & __GFP_NOFAIL) - return true; + /* + * memory reliable only handle memory allocation from movable zone + * (force alloc from non-movable zone or force alloc from movable + * zone) to get total isolation. + */ + zone_movable = gfp_zone(*gfp_mask) == ZONE_MOVABLE; + if (!zone_movable) + return; if (gfp_ori & ___GFP_RELIABILITY) { - if (!(gfp_ori & __GFP_HIGHMEM) || !(gfp_ori & __GFP_MOVABLE)) - return true; + *gfp_mask |= ___GFP_RELIABILITY; + return; + } - if (mem_reliable_watermark_ok(1 << order)) { - *gfp_mask |= ___GFP_RELIABILITY; - return true; - } + if (!in_task()) + return; - if (reliable_allow_fb_enabled()) - return true; + if (is_global_init(current) || (current->flags & PF_RELIABLE)) + *gfp_mask |= ___GFP_RELIABILITY; +} +/* + * return true means memory allocation need retry and flag ___GFP_RELIABILITY + * must be cleared. 
+ */ +static inline bool check_after_alloc(gfp_t *gfp_mask, unsigned int order, + int preferred_nid, nodemask_t *nodemask, + struct page **_page) +{ + if (!mem_reliable_is_enabled()) return false; - } - /* - * Init tasks will alloc memory from non-mirrored region if their - * allocation trigger task_reliable_limit - */ - if (is_global_init(current)) { - if (reliable_mem_limit_check(1 << order) && - mem_reliable_watermark_ok(1 << order)) - *gfp_mask |= ___GFP_RELIABILITY; - return true; - } + if (!(*gfp_mask & ___GFP_RELIABILITY)) + return false; - /* - * This only check task_reliable_limit without ___GFP_RELIABILITY - * or this process is global init. - * For kernel internal mechanism(hugepaged collapse and others) - * If they alloc memory for user and obey task_reliable_limit, they - * need to check this limit before allocing pages. - */ - if ((current->flags & PF_RELIABLE) && (gfp_ori & __GFP_HIGHMEM) && - (gfp_ori & __GFP_MOVABLE)) { - if (reliable_mem_limit_check(1 << order) && - mem_reliable_watermark_ok(1 << order)) { - *gfp_mask |= ___GFP_RELIABILITY; - return true; - } + if (!*_page) + goto out_retry; - if (reliable_allow_fb_enabled()) - return true; + if (*gfp_mask & __GFP_NOFAIL) + goto out; - return false; + /* percpu counter is not initialized, ignore limit check */ + if (!mem_reliable_counter_initialized()) + goto out; + + /* spcial user task, systemd is limited by task_reliable_limit */ + if (((current->flags & PF_RELIABLE) || is_global_init(current)) && + !reliable_mem_limit_check(1 << order)) + goto out_free_page; + + goto out; + +out_free_page: + __free_pages(*_page, order); + *_page = NULL; + +out_retry: + if (reliable_allow_fb_enabled() || is_global_init(current)) { + *gfp_mask &= ~___GFP_RELIABILITY; + return true; } - return true; + if (*gfp_mask & (__GFP_NORETRY | __GFP_RETRY_MAYFAIL | __GFP_THISNODE)) + goto out; + + /* Coredumps can quickly deplete all memory reserves */ + if (current->flags & PF_DUMPCORE) + goto out; + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + + /* oom here */ + mem_reliable_out_of_memory(*gfp_mask, order, preferred_nid, + nodemask); +out: + return false; } /* @@ -4725,12 +4711,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, return NULL; } - if (!prepare_before_alloc(&gfp_mask, order)) { - mem_reliable_out_of_memory(gfp_mask, order, preferred_nid, - nodemask); - goto out; - } + prepare_before_alloc(&gfp_mask); +retry: alloc_mask = gfp_mask; if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; @@ -4767,6 +4750,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, page = NULL; } + if (check_after_alloc(&gfp_mask, order, preferred_nid, nodemask, &page)) + goto retry; + trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); return page; diff --git a/mm/page_counter.c b/mm/page_counter.c index 147ff99187b8197142746ef13cc28acf77dd05cf..58903e3bfdd3be1012e24920955ff745c041e283 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -57,9 +57,13 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_sub_return(nr_pages, &counter->usage); - propagate_protected_usage(counter, new); /* More uncharges than charges? 
*/ - WARN_ON_ONCE(new < 0); + if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", + new, nr_pages)) { + new = 0; + atomic_long_set(&counter->usage, new); + } + propagate_protected_usage(counter, new); } /** diff --git a/mm/share_pool.c b/mm/share_pool.c index 90733b807f12e6ecb9a868760258979a88d6942e..5ba353ddfabd9214ccd74b196d16597dd245ef65 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -15,7 +15,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - #define pr_fmt(fmt) "share pool: " fmt #include @@ -91,6 +90,7 @@ static int __read_mostly enable_share_k2u_spg = 1; int sysctl_ac_mode = AC_NONE; /* debug mode */ int sysctl_sp_debug_mode; +EXPORT_SYMBOL(sysctl_sp_debug_mode); int sysctl_share_pool_map_lock_enable; @@ -2173,6 +2173,7 @@ struct sp_alloc_context { bool need_fallocate; struct timespec64 start; struct timespec64 end; + bool have_mbind; }; static void trace_sp_alloc_begin(struct sp_alloc_context *ac) @@ -2315,6 +2316,7 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags, ac->sp_flags = sp_flags; ac->state = ALLOC_NORMAL; ac->need_fallocate = false; + ac->have_mbind = false; return 0; } @@ -2347,6 +2349,9 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa, if (spg_node) prot = spg_node->prot; + if (ac->sp_flags & SP_PROT_RO) + prot = PROT_READ; + /* when success, mmap_addr == spa->va_start */ mmap_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot); if (IS_ERR_VALUE(mmap_addr)) { @@ -2371,6 +2376,10 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa, ret = -EINVAL; goto unmap; } + + if (ac->sp_flags & SP_PROT_RO) + vma->vm_flags &= ~VM_MAYWRITE; + /* clean PTE_RDONLY flags or trigger SMMU event */ if (prot & PROT_WRITE) vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); @@ -2408,7 +2417,7 @@ static void sp_alloc_fallback(struct sp_area *spa, struct sp_alloc_context *ac) } static int sp_alloc_populate(struct mm_struct *mm, struct sp_area *spa, - struct sp_group_node *spg_node, struct sp_alloc_context *ac) + struct sp_alloc_context *ac) { int ret = 0; unsigned long sp_addr = spa->va_start; @@ -2440,25 +2449,19 @@ static int sp_alloc_populate(struct mm_struct *mm, struct sp_area *spa, if (ret) sp_add_work_compact(); } - if (ret) { - if (spa->spg != spg_none) - sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node); - else - sp_munmap(mm, spa->va_start, spa->real_size); - - if (unlikely(fatal_signal_pending(current))) - pr_warn_ratelimited("allocation failed, current thread is killed\n"); - else - pr_warn_ratelimited("allocation failed due to mm populate failed" - "(potential no enough memory when -12): %d\n", ret); - sp_fallocate(spa); /* need this, otherwise memleak */ - sp_alloc_fallback(spa, ac); - } else { - ac->need_fallocate = true; - } return ret; } +static long sp_mbind(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned long node) +{ + nodemask_t nmask; + + nodes_clear(nmask); + node_set(node, nmask); + return __do_mbind(start, len, MPOL_BIND, MPOL_F_STATIC_NODES, + &nmask, MPOL_MF_STRICT, mm); +} + static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa, struct sp_group_node *spg_node, struct sp_alloc_context *ac) { @@ -2474,7 +2477,34 @@ static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa, return ret; } - ret = sp_alloc_populate(mm, spa, spg_node, ac); + if (!ac->have_mbind) { + ret = 
sp_mbind(mm, spa->va_start, spa->real_size, spa->node_id); + if (ret < 0) { + pr_err("cannot bind the memory range to specified node:%d, err:%d\n", + spa->node_id, ret); + goto err; + } + ac->have_mbind = true; + } + + ret = sp_alloc_populate(mm, spa, ac); + if (ret) { +err: + if (spa->spg != spg_none) + sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node); + else + sp_munmap(mm, spa->va_start, spa->real_size); + + if (unlikely(fatal_signal_pending(current))) + pr_warn_ratelimited("allocation failed, current thread is killed\n"); + else + pr_warn_ratelimited("allocation failed due to mm populate failed(potential no enough memory when -12): %d\n", + ret); + sp_fallocate(spa); /* need this, otherwise memleak */ + sp_alloc_fallback(spa, ac); + } else + ac->need_fallocate = true; + return ret; } @@ -2496,11 +2526,6 @@ static int sp_alloc_mmap_populate(struct sp_area *spa, if (mmap_ret) { if (ac->state != ALLOC_COREDUMP) return mmap_ret; - if (ac->spg == spg_none) { - sp_alloc_unmap(mm, spa, spg_node); - pr_err("dvpp allocation failed due to coredump"); - return mmap_ret; - } ac->state = ALLOC_NORMAL; continue; } @@ -2648,6 +2673,9 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, goto put_mm; } + if (kc && kc->sp_flags & SP_PROT_RO) + prot = PROT_READ; + ret_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot); if (IS_ERR_VALUE(ret_addr)) { pr_debug("k2u mmap failed %lx\n", ret_addr); @@ -2660,6 +2688,9 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, if (prot & PROT_WRITE) vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); + if (kc && kc->sp_flags & SP_PROT_RO) + vma->vm_flags &= ~VM_MAYWRITE; + if (is_vm_hugetlb_page(vma)) { ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0); if (ret) { @@ -2711,6 +2742,7 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un struct sp_area *spa; struct spg_proc_stat *stat; unsigned long prot = PROT_READ | PROT_WRITE; + struct sp_k2u_context kc; down_write(&sp_group_sem); stat = sp_init_process_stat(current, current->mm, spg_none); @@ -2729,8 +2761,8 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un } spa->kva = kva; - - uva = (void *)sp_remap_kva_to_vma(kva, spa, current->mm, prot, NULL); + kc.sp_flags = sp_flags; + uva = (void *)sp_remap_kva_to_vma(kva, spa, current->mm, prot, &kc); __sp_area_drop(spa); if (IS_ERR(uva)) pr_err("remap k2u to task failed %ld\n", PTR_ERR(uva)); @@ -4076,9 +4108,9 @@ void spg_overview_show(struct seq_file *seq) atomic_read(&sp_overall_stat.spa_total_num)); } - down_read(&sp_group_sem); + down_read(&sp_spg_stat_sem); idr_for_each(&sp_spg_stat_idr, idr_spg_stat_cb, seq); - up_read(&sp_group_sem); + up_read(&sp_spg_stat_sem); if (seq != NULL) seq_puts(seq, "\n"); @@ -4117,7 +4149,6 @@ static int idr_proc_stat_cb(int id, void *p, void *data) long sp_res, sp_res_nsize, non_sp_res, non_sp_shm; /* to prevent ABBA deadlock, first hold sp_group_sem */ - down_read(&sp_group_sem); mutex_lock(&spg_stat->lock); hash_for_each(spg_stat->hash, i, spg_proc_stat, gnode) { proc_stat = spg_proc_stat->proc_stat; @@ -4146,7 +4177,6 @@ static int idr_proc_stat_cb(int id, void *p, void *data) seq_putc(seq, '\n'); } mutex_unlock(&spg_stat->lock); - up_read(&sp_group_sem); return 0; } @@ -4164,10 +4194,16 @@ static int proc_stat_show(struct seq_file *seq, void *offset) byte2kb(atomic64_read(&kthread_stat.alloc_size)), 
byte2kb(atomic64_read(&kthread_stat.k2u_size))); - /* pay attention to potential ABBA deadlock */ + /* + * This ugly code is just for fixing the ABBA deadlock against + * sp_group_add_task. + */ + down_read(&sp_group_sem); down_read(&sp_spg_stat_sem); idr_for_each(&sp_spg_stat_idr, idr_proc_stat_cb, seq); up_read(&sp_spg_stat_sem); + up_read(&sp_group_sem); + return 0; } diff --git a/mm/shmem.c b/mm/shmem.c index bc62dc7327812e3bd2e8de6e762bb76166bde1f6..8915a5b9ad0a5f5800bb63d32cadd414c8ea1f46 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -541,7 +541,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shmem_inode_info *info; struct page *page; unsigned long batch = sc ? sc->nr_to_scan : 128; - int removed = 0, split = 0; + int split = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; @@ -556,7 +556,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); - removed++; goto next; } @@ -564,12 +563,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, if (round_up(inode->i_size, PAGE_SIZE) == round_up(inode->i_size, HPAGE_PMD_SIZE)) { list_move(&info->shrinklist, &to_remove); - removed++; goto next; } list_move(&info->shrinklist, &list); next: + sbinfo->shrinklist_len--; if (!--batch) break; } @@ -589,7 +588,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, inode = &info->vfs_inode; if (nr_to_split && split >= nr_to_split) - goto leave; + goto move_back; page = find_get_page(inode->i_mapping, (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); @@ -603,38 +602,44 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, } /* - * Leave the inode on the list if we failed to lock - * the page at this time. + * Move the inode on the list back to shrinklist if we failed + * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. */ if (!trylock_page(page)) { put_page(page); - goto leave; + goto move_back; } ret = split_huge_page(page); unlock_page(page); put_page(page); - /* If split failed leave the inode on the list */ + /* If split failed move the inode on the list back to shrinklist */ if (ret) - goto leave; + goto move_back; split++; drop: list_del_init(&info->shrinklist); - removed++; -leave: + goto put; +move_back: + /* + * Make sure the inode is either on the global list or deleted + * from any local list before iput() since it could be deleted + * in another thread once we put the inode (then the local list + * is corrupted). 
+ */ + spin_lock(&sbinfo->shrinklist_lock); + list_move(&info->shrinklist, &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + spin_unlock(&sbinfo->shrinklist_lock); +put: iput(inode); } - spin_lock(&sbinfo->shrinklist_lock); - list_splice_tail(&list, &sbinfo->shrinklist); - sbinfo->shrinklist_len -= removed; - spin_unlock(&sbinfo->shrinklist_lock); - return split; } @@ -728,6 +733,7 @@ static int shmem_add_to_page_cache(struct page *page, __inc_node_page_state(page, NR_SHMEM_THPS); __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); + shmem_reliable_page_counter(page, nr); xa_unlock_irq(&mapping->i_pages); } else { page->mapping = NULL; @@ -753,6 +759,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) mapping->nrpages--; __dec_node_page_state(page, NR_FILE_PAGES); __dec_node_page_state(page, NR_SHMEM); + shmem_reliable_page_counter(page, -1); xa_unlock_irq(&mapping->i_pages); put_page(page); BUG_ON(error); @@ -957,8 +964,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, truncate_inode_page(mapping, page); } } - shmem_reliable_page_counter( - page, -(1 << compound_order(page))); unlock_page(page); } pagevec_remove_exceptionals(&pvec); @@ -1069,8 +1074,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, break; } } - shmem_reliable_page_counter( - page, -(1 << compound_order(page))); unlock_page(page); } pagevec_remove_exceptionals(&pvec); @@ -1976,7 +1979,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); - shmem_reliable_page_counter(page, 1 << compound_order(page)); alloced = true; if (PageTransHuge(page) && diff --git a/mm/vmscan.c b/mm/vmscan.c index d59b22a7ba9a2dad5694dbeb3742c420ebf218fa..42f24473756b7792bec5d3640402dbfe072d1d5b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -185,7 +185,7 @@ int vm_cache_reclaim_weight __read_mostly; int vm_cache_reclaim_weight_min; int vm_cache_reclaim_weight_max; int vm_cache_reclaim_enable; -static DEFINE_PER_CPU(struct delayed_work, vmscan_work); +static struct work_struct vmscan_works[MAX_NUMNODES]; #endif static LIST_HEAD(shrinker_list); @@ -3972,8 +3972,8 @@ static unsigned long __shrink_page_cache(gfp_t mask) .gfp_mask = current_gfp_context(mask), .reclaim_idx = gfp_zone(mask), .may_writepage = !laptop_mode, - .nr_to_reclaim = SWAP_CLUSTER_MAX * - (unsigned long)vm_cache_reclaim_weight, + .nr_to_reclaim = SWAP_CLUSTER_MAX * nr_cpus_node(numa_node_id()) * + vm_cache_reclaim_weight, .may_unmap = 1, .may_swap = mem_reliable_is_enabled() ? 0 : 1, .no_shrink_slab = mem_reliable_is_enabled() ? 
0 : 1, @@ -3997,26 +3997,21 @@ static DECLARE_DEFERRABLE_WORK(shepherd, shrink_shepherd); static void shrink_shepherd(struct work_struct *w) { - int cpu; - get_online_cpus(); + int node; - for_each_online_cpu(cpu) { - struct delayed_work *work = &per_cpu(vmscan_work, cpu); - - if (!delayed_work_pending(work) && vm_cache_reclaim_enable) - queue_delayed_work_on(cpu, system_wq, work, 0); + for_each_online_node(node) { + if (!work_pending(&vmscan_works[node]) && vm_cache_reclaim_enable) + queue_work_node(node, system_unbound_wq, &vmscan_works[node]); } - put_online_cpus(); - /* we want all kernel thread to stop */ if (vm_cache_reclaim_enable) { if (vm_cache_reclaim_s == 0) - schedule_delayed_work(&shepherd, + queue_delayed_work(system_unbound_wq, &shepherd, round_jiffies_relative(120 * HZ)); else - schedule_delayed_work(&shepherd, + queue_delayed_work(system_unbound_wq, &shepherd, round_jiffies_relative((unsigned long) vm_cache_reclaim_s * HZ)); } @@ -4024,15 +4019,12 @@ static void shrink_shepherd(struct work_struct *w) static void shrink_shepherd_timer(void) { - int cpu; - - for_each_possible_cpu(cpu) { - struct delayed_work *work = &per_cpu(vmscan_work, cpu); + int i; - INIT_DEFERRABLE_WORK(work, shrink_page_cache_work); - } + for (i = 0; i < MAX_NUMNODES; i++) + INIT_WORK(&vmscan_works[i], shrink_page_cache_work); - schedule_delayed_work(&shepherd, + queue_delayed_work(system_unbound_wq, &shepherd, round_jiffies_relative((unsigned long)vm_cache_reclaim_s * HZ)); } @@ -4048,9 +4040,6 @@ unsigned long shrink_page_cache(gfp_t mask) static void shrink_page_cache_work(struct work_struct *w) { - struct delayed_work *work = to_delayed_work(w); - unsigned long nr_pages; - /* * if vm_cache_reclaim_enable or vm_cache_reclaim_s is zero, * we do not shrink page cache again. 
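/*
 * A minimal sketch (hypothetical "example_*" names) of the per-node deferred
 * work pattern the vmscan hunks above move to: one work_struct per node,
 * queued on the unbound workqueue close to that node, instead of one delayed
 * work per CPU.
 */
#include <linux/workqueue.h>
#include <linux/nodemask.h>

static struct work_struct example_works[MAX_NUMNODES];

static void example_work_fn(struct work_struct *work)
{
        /* the per-node processing (page cache shrink above) would go here */
}

static void example_works_init(void)
{
        int node;

        for (node = 0; node < MAX_NUMNODES; node++)
                INIT_WORK(&example_works[node], example_work_fn);
}

static void example_kick_nodes(void)
{
        int node;

        /* skip nodes whose previous round has not finished yet */
        for_each_online_node(node)
                if (!work_pending(&example_works[node]))
                        queue_work_node(node, system_unbound_wq,
                                        &example_works[node]);
}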
@@ -4063,10 +4052,7 @@ static void shrink_page_cache_work(struct work_struct *w) return; /* It should wait more time if we hardly reclaim the page cache */ - nr_pages = shrink_page_cache(GFP_KERNEL); - if ((nr_pages < SWAP_CLUSTER_MAX) && vm_cache_reclaim_enable) - queue_delayed_work_on(smp_processor_id(), system_wq, work, - round_jiffies_relative((vm_cache_reclaim_s + 120) * HZ)); + shrink_page_cache(GFP_KERNEL); } static void shrink_page_cache_init(void) @@ -4088,13 +4074,6 @@ static void shrink_page_cache_init(void) shrink_shepherd_timer(); } -static int kswapd_cpu_down_prep(unsigned int cpu) -{ - cancel_delayed_work_sync(&per_cpu(vmscan_work, cpu)); - - return 0; -} - int cache_reclaim_enable_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -4105,8 +4084,8 @@ int cache_reclaim_enable_handler(struct ctl_table *table, int write, return ret; if (write) - schedule_delayed_work(&shepherd, round_jiffies_relative( - (unsigned long)vm_cache_reclaim_s * HZ)); + queue_delayed_work(system_unbound_wq, &shepherd, + round_jiffies_relative((unsigned long)vm_cache_reclaim_s * HZ)); return 0; } @@ -4121,7 +4100,7 @@ int cache_reclaim_sysctl_handler(struct ctl_table *table, int write, return ret; if (write) - mod_delayed_work(system_wq, &shepherd, + mod_delayed_work(system_unbound_wq, &shepherd, round_jiffies_relative( (unsigned long)vm_cache_reclaim_s * HZ)); @@ -4194,15 +4173,9 @@ static int __init kswapd_init(void) swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); -#ifdef CONFIG_SHRINK_PAGECACHE - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "mm/vmscan:online", kswapd_cpu_online, - kswapd_cpu_down_prep); -#else ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmscan:online", kswapd_cpu_online, NULL); -#endif WARN_ON(ret < 0); #ifdef CONFIG_SHRINK_PAGECACHE shrink_page_cache_init(); diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 9daab0dd833b31cbb663b7756970a545cc59f3cf..81002cba88b322e19781d1675c6531766faefe76 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -301,9 +301,9 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) ref = priv->rings[i].intf->ref[j]; gnttab_end_foreign_access(ref, 0, 0); } - free_pages((unsigned long)priv->rings[i].data.in, - XEN_9PFS_RING_ORDER - - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + free_pages_exact(priv->rings[i].data.in, + 1UL << (XEN_9PFS_RING_ORDER + + XEN_PAGE_SHIFT)); } gnttab_end_foreign_access(priv->rings[i].ref, 0, 0); free_page((unsigned long)priv->rings[i].intf); @@ -341,8 +341,8 @@ static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, if (ret < 0) goto out; ring->ref = ret; - bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - XEN_9PFS_RING_ORDER - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + bytes = alloc_pages_exact(1UL << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT), + GFP_KERNEL | __GFP_ZERO); if (!bytes) { ret = -ENOMEM; goto out; @@ -373,9 +373,7 @@ static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, if (bytes) { for (i--; i >= 0; i--) gnttab_end_foreign_access(ring->intf->ref[i], 0, 0); - free_pages((unsigned long)bytes, - XEN_9PFS_RING_ORDER - - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + free_pages_exact(bytes, 1UL << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT)); } gnttab_end_foreign_access(ring->ref, 0, 0); free_page((unsigned long)ring->intf); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 44ec492f3dc2f42e059939344cab4064ffbca547..4ecdde530265494f1a1cced83b085bac0318c879 100644 --- a/net/ax25/af_ax25.c +++ 
b/net/ax25/af_ax25.c @@ -80,6 +80,7 @@ static void ax25_kill_by_device(struct net_device *dev) { ax25_dev *ax25_dev; ax25_cb *s; + struct sock *sk; if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) return; @@ -88,11 +89,26 @@ static void ax25_kill_by_device(struct net_device *dev) again: ax25_for_each(s, &ax25_list) { if (s->ax25_dev == ax25_dev) { - s->ax25_dev = NULL; + sk = s->sk; + if (!sk) { + spin_unlock_bh(&ax25_list_lock); + ax25_disconnect(s, ENETUNREACH); + s->ax25_dev = NULL; + spin_lock_bh(&ax25_list_lock); + goto again; + } + sock_hold(sk); spin_unlock_bh(&ax25_list_lock); + lock_sock(sk); ax25_disconnect(s, ENETUNREACH); + s->ax25_dev = NULL; + if (sk->sk_socket) { + dev_put(ax25_dev->dev); + ax25_dev_put(ax25_dev); + } + release_sock(sk); spin_lock_bh(&ax25_list_lock); - + sock_put(sk); /* The entry could have been deleted from the * list meanwhile and thus the next pointer is * no longer valid. Play it safe and restart @@ -356,21 +372,25 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) if (copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl))) return -EFAULT; - if ((ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr)) == NULL) - return -ENODEV; - if (ax25_ctl.digi_count > AX25_MAX_DIGIS) return -EINVAL; if (ax25_ctl.arg > ULONG_MAX / HZ && ax25_ctl.cmd != AX25_KILL) return -EINVAL; + ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr); + if (!ax25_dev) + return -ENODEV; + digi.ndigi = ax25_ctl.digi_count; for (k = 0; k < digi.ndigi; k++) digi.calls[k] = ax25_ctl.digi_addr[k]; - if ((ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev)) == NULL) + ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev); + if (!ax25) { + ax25_dev_put(ax25_dev); return -ENOTCONN; + } switch (ax25_ctl.cmd) { case AX25_KILL: @@ -437,6 +457,7 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) } out_put: + ax25_dev_put(ax25_dev); ax25_cb_put(ax25); return ret; @@ -960,14 +981,16 @@ static int ax25_release(struct socket *sock) { struct sock *sk = sock->sk; ax25_cb *ax25; + ax25_dev *ax25_dev; if (sk == NULL) return 0; sock_hold(sk); - sock_orphan(sk); lock_sock(sk); + sock_orphan(sk); ax25 = sk_to_ax25(sk); + ax25_dev = ax25->ax25_dev; if (sk->sk_type == SOCK_SEQPACKET) { switch (ax25->state) { @@ -1029,6 +1052,15 @@ static int ax25_release(struct socket *sock) sk->sk_state_change(sk); ax25_destroy_socket(ax25); } + if (ax25_dev) { + del_timer_sync(&ax25->timer); + del_timer_sync(&ax25->t1timer); + del_timer_sync(&ax25->t2timer); + del_timer_sync(&ax25->t3timer); + del_timer_sync(&ax25->idletimer); + dev_put(ax25_dev->dev); + ax25_dev_put(ax25_dev); + } sock->sk = NULL; release_sock(sk); @@ -1105,8 +1137,10 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } } - if (ax25_dev != NULL) + if (ax25_dev) { ax25_fillin_cb(ax25, ax25_dev); + dev_hold(ax25_dev->dev); + } done: ax25_cb_add(ax25); diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index d92195cd78349f2d50eefdc31ac186087c2c77c3..55a611f7239bc3ec23319522e4831efa3f2d183e 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -40,6 +40,7 @@ ax25_dev *ax25_addr_ax25dev(ax25_address *addr) for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) if (ax25cmp(addr, (ax25_address *)ax25_dev->dev->dev_addr) == 0) { res = ax25_dev; + ax25_dev_hold(ax25_dev); } spin_unlock_bh(&ax25_dev_lock); @@ -59,6 +60,7 @@ void ax25_dev_device_up(struct net_device *dev) return; } + 
refcount_set(&ax25_dev->refcount, 1); dev->ax25_ptr = ax25_dev; ax25_dev->dev = dev; dev_hold(dev); @@ -87,6 +89,7 @@ void ax25_dev_device_up(struct net_device *dev) ax25_dev->next = ax25_dev_list; ax25_dev_list = ax25_dev; spin_unlock_bh(&ax25_dev_lock); + ax25_dev_hold(ax25_dev); ax25_register_dev_sysctl(ax25_dev); } @@ -116,9 +119,10 @@ void ax25_dev_device_down(struct net_device *dev) if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; spin_unlock_bh(&ax25_dev_lock); + ax25_dev_put(ax25_dev); dev->ax25_ptr = NULL; dev_put(dev); - kfree(ax25_dev); + ax25_dev_put(ax25_dev); return; } @@ -126,9 +130,10 @@ void ax25_dev_device_down(struct net_device *dev) if (s->next == ax25_dev) { s->next = ax25_dev->next; spin_unlock_bh(&ax25_dev_lock); + ax25_dev_put(ax25_dev); dev->ax25_ptr = NULL; dev_put(dev); - kfree(ax25_dev); + ax25_dev_put(ax25_dev); return; } @@ -136,6 +141,7 @@ void ax25_dev_device_down(struct net_device *dev) } spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; + ax25_dev_put(ax25_dev); } int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) @@ -147,20 +153,32 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) switch (cmd) { case SIOCAX25ADDFWD: - if ((fwd_dev = ax25_addr_ax25dev(&fwd->port_to)) == NULL) + fwd_dev = ax25_addr_ax25dev(&fwd->port_to); + if (!fwd_dev) { + ax25_dev_put(ax25_dev); return -EINVAL; - if (ax25_dev->forward != NULL) + } + if (ax25_dev->forward) { + ax25_dev_put(fwd_dev); + ax25_dev_put(ax25_dev); return -EINVAL; + } ax25_dev->forward = fwd_dev->dev; + ax25_dev_put(fwd_dev); + ax25_dev_put(ax25_dev); break; case SIOCAX25DELFWD: - if (ax25_dev->forward == NULL) + if (!ax25_dev->forward) { + ax25_dev_put(ax25_dev); return -EINVAL; + } ax25_dev->forward = NULL; + ax25_dev_put(ax25_dev); break; default: + ax25_dev_put(ax25_dev); return -EINVAL; } diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 66d54fc11831c0de951275cdd6a25996d5cf099c..8f81de88f0066e9e8d930e96c36097909585db49 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -78,11 +78,13 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_dev *ax25_dev; int i; - if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL) - return -EINVAL; if (route->digi_count > AX25_MAX_DIGIS) return -EINVAL; + ax25_dev = ax25_addr_ax25dev(&route->port_addr); + if (!ax25_dev) + return -EINVAL; + write_lock_bh(&ax25_route_lock); ax25_rt = ax25_route_list; @@ -94,6 +96,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return -ENOMEM; } ax25_rt->digipeat->lastrepeat = -1; @@ -104,6 +107,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) } } write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } ax25_rt = ax25_rt->next; @@ -111,6 +115,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) if ((ax25_rt = kmalloc(sizeof(ax25_route), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return -ENOMEM; } @@ -123,6 +128,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); kfree(ax25_rt); + ax25_dev_put(ax25_dev); return -ENOMEM; } ax25_rt->digipeat->lastrepeat = -1; @@ -135,6 +141,7 @@ static int __must_check 
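/*
 * A minimal sketch (hypothetical "example_dev" type) of the reference-counted
 * lookup pattern the ax25_dev hunks above introduce: the list walk takes a
 * reference under the lock before returning the object, and every caller
 * drops it with a matching put, so the device cannot be freed while an ioctl
 * or route operation still uses it.
 */
#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct example_dev {
        struct example_dev *next;
        int id;
        refcount_t refcount;            /* set to 1 when the device is created */
};

static struct example_dev *example_dev_list;
static DEFINE_SPINLOCK(example_dev_lock);

static struct example_dev *example_dev_get(int id)
{
        struct example_dev *d, *res = NULL;

        spin_lock_bh(&example_dev_lock);
        for (d = example_dev_list; d; d = d->next) {
                if (d->id == id) {
                        res = d;
                        refcount_inc(&d->refcount);     /* reference for the caller */
                        break;
                }
        }
        spin_unlock_bh(&example_dev_lock);

        return res;
}

static void example_dev_put(struct example_dev *d)
{
        if (refcount_dec_and_test(&d->refcount))
                kfree(d);               /* last reference gone: safe to free */
}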
ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->next = ax25_route_list; ax25_route_list = ax25_rt; write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } @@ -176,6 +183,7 @@ static int ax25_rt_del(struct ax25_routes_struct *route) } } write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } @@ -218,6 +226,7 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option) out: write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return err; } diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index 038b109b2be70e8da91589a1bf3c82e73a3903ca..c129865cad9f475f5a426115d3d88a01cd4b31ef 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -264,12 +264,20 @@ void ax25_disconnect(ax25_cb *ax25, int reason) { ax25_clear_queues(ax25); - if (!ax25->sk || !sock_flag(ax25->sk, SOCK_DESTROY)) - ax25_stop_heartbeat(ax25); - ax25_stop_t1timer(ax25); - ax25_stop_t2timer(ax25); - ax25_stop_t3timer(ax25); - ax25_stop_idletimer(ax25); + if (reason == ENETUNREACH) { + del_timer_sync(&ax25->timer); + del_timer_sync(&ax25->t1timer); + del_timer_sync(&ax25->t2timer); + del_timer_sync(&ax25->t3timer); + del_timer_sync(&ax25->idletimer); + } else { + if (!ax25->sk || !sock_flag(ax25->sk, SOCK_DESTROY)) + ax25_stop_heartbeat(ax25); + ax25_stop_t1timer(ax25); + ax25_stop_t2timer(ax25); + ax25_stop_t3timer(ax25); + ax25_stop_idletimer(ax25); + } ax25->state = AX25_STATE_0; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index a403179eea49080d25d23f5aeae30c499312fec9..fd5e3470560a53b23f7364c710ffb468877e0eb8 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -727,6 +727,9 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu) mtu = nf_bridge->frag_max_size; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) { nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); @@ -744,8 +747,6 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; - nf_bridge_update_protocol(skb); - data = this_cpu_ptr(&brnf_frag_data_storage); data->vlan_tci = skb->vlan_tci; @@ -768,8 +769,6 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; - nf_bridge_update_protocol(skb); - data = this_cpu_ptr(&brnf_frag_data_storage); data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; diff --git a/net/can/bcm.c b/net/can/bcm.c index e66377e764ba9f0b288b85a042da22b3bc46d062..ffc34e911808389241402962bfb6135e9b9f107f 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -105,7 +105,6 @@ struct bcm_op { unsigned long frames_abs, frames_filtered; struct bcm_timeval ival1, ival2; struct hrtimer timer, thrtimer; - struct tasklet_struct tsklet, thrtsklet; ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; int rx_ifindex; int cfsiz; @@ -370,25 +369,34 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, } } -static void bcm_tx_start_timer(struct bcm_op *op) +static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt) { + ktime_t ival; + if (op->kt_ival1 && op->count) - hrtimer_start(&op->timer, - ktime_add(ktime_get(), op->kt_ival1), - HRTIMER_MODE_ABS); + ival = op->kt_ival1; else if (op->kt_ival2) - 
hrtimer_start(&op->timer, - ktime_add(ktime_get(), op->kt_ival2), - HRTIMER_MODE_ABS); + ival = op->kt_ival2; + else + return false; + + hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival)); + return true; } -static void bcm_tx_timeout_tsklet(unsigned long data) +static void bcm_tx_start_timer(struct bcm_op *op) { - struct bcm_op *op = (struct bcm_op *)data; + if (bcm_tx_set_expiry(op, &op->timer)) + hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT); +} + +/* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */ +static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) +{ + struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); struct bcm_msg_head msg_head; if (op->kt_ival1 && (op->count > 0)) { - op->count--; if (!op->count && (op->flags & TX_COUNTEVT)) { @@ -406,22 +414,12 @@ static void bcm_tx_timeout_tsklet(unsigned long data) } bcm_can_tx(op); - } else if (op->kt_ival2) + } else if (op->kt_ival2) { bcm_can_tx(op); + } - bcm_tx_start_timer(op); -} - -/* - * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions - */ -static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) -{ - struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); - - tasklet_schedule(&op->tsklet); - - return HRTIMER_NORESTART; + return bcm_tx_set_expiry(op, &op->timer) ? + HRTIMER_RESTART : HRTIMER_NORESTART; } /* @@ -488,7 +486,7 @@ static void bcm_rx_update_and_send(struct bcm_op *op, /* do not send the saved data - only start throttle timer */ hrtimer_start(&op->thrtimer, ktime_add(op->kt_lastmsg, op->kt_ival2), - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_SOFT); return; } @@ -547,14 +545,21 @@ static void bcm_rx_starttimer(struct bcm_op *op) return; if (op->kt_ival1) - hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL); + hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT); } -static void bcm_rx_timeout_tsklet(unsigned long data) +/* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */ +static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) { - struct bcm_op *op = (struct bcm_op *)data; + struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); struct bcm_msg_head msg_head; + /* if user wants to be informed, when cyclic CAN-Messages come back */ + if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { + /* clear received CAN frames to indicate 'nothing received' */ + memset(op->last_frames, 0, op->nframes * op->cfsiz); + } + /* create notification to user */ memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = RX_TIMEOUT; @@ -566,25 +571,6 @@ static void bcm_rx_timeout_tsklet(unsigned long data) msg_head.nframes = 0; bcm_send_to_user(op, &msg_head, NULL, 0); -} - -/* - * bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out - */ -static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) -{ - struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); - - /* schedule before NET_RX_SOFTIRQ */ - tasklet_hi_schedule(&op->tsklet); - - /* no restart of the timer is done here! 
*/ - - /* if user wants to be informed, when cyclic CAN-Messages come back */ - if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { - /* clear received CAN frames to indicate 'nothing received' */ - memset(op->last_frames, 0, op->nframes * op->cfsiz); - } return HRTIMER_NORESTART; } @@ -592,14 +578,12 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) /* * bcm_rx_do_flush - helper for bcm_rx_thr_flush */ -static inline int bcm_rx_do_flush(struct bcm_op *op, int update, - unsigned int index) +static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index) { struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; if ((op->last_frames) && (lcf->flags & RX_THR)) { - if (update) - bcm_rx_changed(op, lcf); + bcm_rx_changed(op, lcf); return 1; } return 0; @@ -607,11 +591,8 @@ static inline int bcm_rx_do_flush(struct bcm_op *op, int update, /* * bcm_rx_thr_flush - Check for throttled data and send it to the userspace - * - * update == 0 : just check if throttled data is available (any irq context) - * update == 1 : check and send throttled data to userspace (soft_irq context) */ -static int bcm_rx_thr_flush(struct bcm_op *op, int update) +static int bcm_rx_thr_flush(struct bcm_op *op) { int updated = 0; @@ -620,24 +601,16 @@ static int bcm_rx_thr_flush(struct bcm_op *op, int update) /* for MUX filter we start at index 1 */ for (i = 1; i < op->nframes; i++) - updated += bcm_rx_do_flush(op, update, i); + updated += bcm_rx_do_flush(op, i); } else { /* for RX_FILTER_ID and simple filter */ - updated += bcm_rx_do_flush(op, update, 0); + updated += bcm_rx_do_flush(op, 0); } return updated; } -static void bcm_rx_thr_tsklet(unsigned long data) -{ - struct bcm_op *op = (struct bcm_op *)data; - - /* push the changed data to the userspace */ - bcm_rx_thr_flush(op, 1); -} - /* * bcm_rx_thr_handler - the time for blocked content updates is over now: * Check for throttled data and send it to the userspace @@ -646,9 +619,7 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer); - tasklet_schedule(&op->thrtsklet); - - if (bcm_rx_thr_flush(op, 0)) { + if (bcm_rx_thr_flush(op)) { hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2); return HRTIMER_RESTART; } else { @@ -744,23 +715,8 @@ static struct bcm_op *bcm_find_op(struct list_head *ops, static void bcm_remove_op(struct bcm_op *op) { - if (op->tsklet.func) { - while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) || - test_bit(TASKLET_STATE_RUN, &op->tsklet.state) || - hrtimer_active(&op->timer)) { - hrtimer_cancel(&op->timer); - tasklet_kill(&op->tsklet); - } - } - - if (op->thrtsklet.func) { - while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) || - test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) || - hrtimer_active(&op->thrtimer)) { - hrtimer_cancel(&op->thrtimer); - tasklet_kill(&op->thrtsklet); - } - } + hrtimer_cancel(&op->timer); + hrtimer_cancel(&op->thrtimer); if ((op->frames) && (op->frames != &op->sframe)) kfree(op->frames); @@ -994,15 +950,13 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ - hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&op->timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); op->timer.function = bcm_tx_timeout_handler; - /* initialize tasklet for tx countevent notification */ - tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet, - (unsigned long) op); - /* 
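/*
 * A minimal sketch (hypothetical "example_*" names) of the "soft" hrtimer
 * pattern the bcm hunks above adopt: with HRTIMER_MODE_*_SOFT the callback
 * already runs in softirq context, so the extra tasklet bounce is dropped and
 * the handler does the work and re-arms itself directly.
 */
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_timer;
static ktime_t example_interval;

static enum hrtimer_restart example_timer_fn(struct hrtimer *t)
{
        /* periodic work goes here; softirq context, so no sleeping */

        hrtimer_forward_now(t, example_interval);       /* re-arm from "now" */
        return HRTIMER_RESTART;
}

static void example_timer_setup(void)
{
        example_interval = ms_to_ktime(100);            /* arbitrary period */
        hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
        example_timer.function = example_timer_fn;
        hrtimer_start(&example_timer, example_interval, HRTIMER_MODE_REL_SOFT);
}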
currently unused in tx_ops */ - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); /* add this bcm_op to the list of the tx_ops */ list_add(&op->list, &bo->tx_ops); @@ -1171,20 +1125,14 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->rx_ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ - hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&op->timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); op->timer.function = bcm_rx_timeout_handler; - /* initialize tasklet for rx timeout notification */ - tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet, - (unsigned long) op); - - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); op->thrtimer.function = bcm_rx_thr_handler; - /* initialize tasklet for rx throttle handling */ - tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet, - (unsigned long) op); - /* add this bcm_op to the list of the rx_ops */ list_add(&op->list, &bo->rx_ops); @@ -1230,12 +1178,12 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, */ op->kt_lastmsg = 0; hrtimer_cancel(&op->thrtimer); - bcm_rx_thr_flush(op, 1); + bcm_rx_thr_flush(op); } if ((op->flags & STARTTIMER) && op->kt_ival1) hrtimer_start(&op->timer, op->kt_ival1, - HRTIMER_MODE_REL); + HRTIMER_MODE_REL_SOFT); } /* now we can register for can_ids, if we added a new bcm_op */ diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 3978a5e8d261c755eef96468f820188ef62a297a..2ed6000126408960518bbd059b3300213115a733 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -219,13 +219,17 @@ static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, rcu_read_lock(); list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + struct net_device *dev; + /* * only add a note to our monitor buffer if: * 1) this is the dev we received on * 2) its after the last_rx delta * 3) our rx_dropped count has gone up */ - if ((new_stat->dev == napi->dev) && + /* Paired with WRITE_ONCE() in dropmon_net_event() */ + dev = READ_ONCE(new_stat->dev); + if ((dev == napi->dev) && (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) && (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { trace_drop_common(NULL, NULL); @@ -340,7 +344,10 @@ static int dropmon_net_event(struct notifier_block *ev_block, mutex_lock(&trace_state_mutex); list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { if (new_stat->dev == dev) { - new_stat->dev = NULL; + + /* Paired with READ_ONCE() in trace_napi_poll_hit() */ + WRITE_ONCE(new_stat->dev, NULL); + if (trace_state == TRACE_OFF) { list_del_rcu(&new_stat->list); kfree_rcu(new_stat, rcu); diff --git a/net/core/filter.c b/net/core/filter.c index 5f24a6b82802413cc4721e06fafc43cf8bbba9fb..0730395918e0fd9325132f8f43a9de764ea40cb5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5528,9 +5528,9 @@ void bpf_warn_invalid_xdp_action(u32 act) { const u32 act_max = XDP_REDIRECT; - WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n", - act > act_max ? "Illegal" : "Driver unsupported", - act); + pr_warn_once("%s XDP return value %u, expect packet loss!\n", + act > act_max ? 
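/*
 * A minimal sketch (hypothetical "example_stat") of the READ_ONCE()/
 * WRITE_ONCE() pairing used by the drop_monitor hunk above: the writer clears
 * the pointer under its mutex, while the lockless RCU reader loads it exactly
 * once and only dereferences its local copy.
 */
#include <linux/compiler.h>
#include <linux/netdevice.h>

struct example_stat {
        struct net_device *dev;
};

static void example_reader(struct example_stat *stat,
                           const struct net_device *target)
{
        struct net_device *dev = READ_ONCE(stat->dev); /* single annotated load */

        if (dev == target) {
                /* work with 'dev' only; stat->dev may change concurrently */
        }
}

static void example_writer(struct example_stat *stat)
{
        /* paired annotated store, done while holding the update-side lock */
        WRITE_ONCE(stat->dev, NULL);
}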
"Illegal" : "Driver unsupported", + act); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index fe0d255d66c8bc54ef83a3016b468f8ce6dcc9e2..7a11b2d90975dc634a5760ba60fa78dba40aeb42 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -203,7 +203,7 @@ static ssize_t speed_show(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev)) { + if (netif_running(netdev) && netif_device_present(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) @@ -1616,6 +1616,9 @@ static void remove_queue_kobjects(struct net_device *dev) net_rx_queue_update_kobjects(dev, real_rx, 0); netdev_queue_update_kobjects(dev, real_tx, 0); + + dev->real_num_rx_queues = 0; + dev->real_num_tx_queues = 0; #ifdef CONFIG_SYSFS kset_unregister(dev->queues_kset); #endif diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index c60123dff803912c398cb6fa9e25a08f326fd879..f4aacbb76fa51564d74b995b712296b1718a8951 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -149,8 +149,10 @@ static void ops_exit_list(const struct pernet_operations *ops, { struct net *net; if (ops->exit) { - list_for_each_entry(net, net_exit_list, exit_list) + list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); + cond_resched(); + } } if (ops->exit_batch) ops->exit_batch(net_exit_list); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 83de32e34bb55032e2935b4416bf9eca6b97dc08..2837cc03f69e200068099d523855bfc38d1aed2d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -130,6 +130,12 @@ int rtnl_is_locked(void) } EXPORT_SYMBOL(rtnl_is_locked); +bool refcount_dec_and_rtnl_lock(refcount_t *r) +{ + return refcount_dec_and_mutex_lock(r, &rtnl_mutex); +} +EXPORT_SYMBOL(refcount_dec_and_rtnl_lock); + #ifdef CONFIG_PROVE_LOCKING bool lockdep_rtnl_is_held(void) { @@ -2936,9 +2942,9 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; - const struct rtnl_link_ops *m_ops = NULL; + const struct rtnl_link_ops *m_ops; struct net_device *dev; - struct net_device *master_dev = NULL; + struct net_device *master_dev; struct ifinfomsg *ifm; char kind[MODULE_NAME_LEN]; char ifname[IFNAMSIZ]; @@ -2973,6 +2979,8 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, dev = NULL; } + master_dev = NULL; + m_ops = NULL; if (dev) { master_dev = netdev_master_upper_dev_get(dev); if (master_dev) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e1daab49b0eb0ad2ccbcdff22d3d93a2d5a8e093..c623c129d0ab6fce1f4685759baf02890b4cda82 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1977,7 +1977,7 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) /* Free pulled out fragments. */ while ((list = skb_shinfo(skb)->frag_list) != insp) { skb_shinfo(skb)->frag_list = list->next; - kfree_skb(list); + consume_skb(list); } /* And insert new clone at head. */ if (clone) { @@ -5482,7 +5482,7 @@ static int pskb_carve_frag_list(struct sk_buff *skb, /* Free pulled out fragments. */ while ((list = shinfo->frag_list) != insp) { shinfo->frag_list = list->next; - kfree_skb(list); + consume_skb(list); } /* And insert new clone at head. 
*/ if (clone) { diff --git a/net/core/sock.c b/net/core/sock.c index 66a21528c7390661ae4ed448c22bf8ca07793b27..246abac5e8d25118256d8f8a5b0833106be4a059 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2229,9 +2229,6 @@ static void sk_leave_memory_pressure(struct sock *sk) } } -/* On 32bit arches, an skb frag is limited to 2^15 */ -#define SKB_FRAG_PAGE_ORDER get_order(32768) - /** * skb_page_frag_refill - check that a page_frag contains enough room * @sz: minimum size of the fragment we want to get diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 14e10214cf87628489e97508c763e3fd6a419461..86268f3f9bb9c3e5565ec8ee4e6ed56ba18cfd21 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1338,8 +1338,11 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, } ops = rcu_dereference(inet_offloads[proto]); - if (likely(ops && ops->callbacks.gso_segment)) + if (likely(ops && ops->callbacks.gso_segment)) { segs = ops->callbacks.gso_segment(skb, features); + if (!segs) + skb->network_header = skb_mac_header(skb) + nhoff - skb->head; + } if (IS_ERR_OR_NULL(segs)) goto out; @@ -1955,6 +1958,10 @@ static int __init inet_init(void) ip_init(); + /* Initialise per-cpu ipv4 mibs */ + if (init_ipv4_mibs()) + panic("%s: Cannot init ipv4 mibs\n", __func__); + /* Setup TCP slab cache for open requests. */ tcp_init(); @@ -1985,12 +1992,6 @@ static int __init inet_init(void) if (init_inet_pernet_ops()) pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); - /* - * Initialise per-cpu ipv4 mibs - */ - - if (init_ipv4_mibs()) - pr_crit("%s: Cannot init ipv4 mibs\n", __func__); ipv4_proc_init(); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 0792a9e2a555fd1728227254a18d08656eac4596..45bb5b60b99409adc14d1828b9218dcc07bd89cb 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -275,6 +275,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; /* this is non-NULL only with UDP Encapsulation */ if (x->encap) { @@ -284,6 +285,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e63905f7f6f9584d0c3549218a5b8aa7e4f0d01e..1a6c059085847375068acfea5461850d8338af53 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -160,12 +160,19 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - if (ip_dont_fragment(sk, &rt->dst)) { + /* Do not bother generating IPID for small packets (eg SYNACK) */ + if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) { iph->frag_off = htons(IP_DF); iph->id = 0; } else { iph->frag_off = 0; - __ip_select_ident(net, iph, 1); + /* TCP packets here are SYNACK with fat IPv4/TCP options. + * Avoid using the hashed IP ident generator. 
+ */ + if (sk->sk_protocol == IPPROTO_TCP) + iph->id = (__force __be16)prandom_u32(); + else + __ip_select_ident(net, iph, 1); } if (opt && opt->opt.optlen) { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d235478d9ca3cb8e900882b67845cb719b19998e..2085af224a4160ba25934b18d23c5ef5de028a7d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -265,7 +265,9 @@ static int __net_init ipmr_rules_init(struct net *net) return 0; err2: + rtnl_lock(); ipmr_free_table(mrt); + rtnl_unlock(); err1: fib_rules_unregister(ops); return err; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 2c11050d679bc9e7b8d5ed7a3c6130f392c3b5e9..08297d3504cf51c5491a05cd16af3bdc4fbfd5d0 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -509,8 +509,11 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) if (IS_ERR(config)) return PTR_ERR(config); } - } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) + } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) { + clusterip_config_entry_put(config); + clusterip_config_put(config); return -EINVAL; + } ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 862744c2854825cb7093755f07875154807ded3d..28d171e6e10b997e3e2fc3a703e849bdedde1dba 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -177,16 +177,23 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) struct sock *sk = NULL; struct inet_sock *isk; struct hlist_nulls_node *hnode; - int dif = skb->dev->ifindex; + int dif, sdif; if (skb->protocol == htons(ETH_P_IP)) { + dif = inet_iif(skb); + sdif = inet_sdif(skb); pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", (int)ident, &ip_hdr(skb)->daddr, dif); #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { + dif = inet6_iif(skb); + sdif = inet6_sdif(skb); pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n", (int)ident, &ipv6_hdr(skb)->daddr, dif); #endif + } else { + pr_err("ping: protocol(%x) is not supported\n", ntohs(skb->protocol)); + return NULL; } read_lock_bh(&ping_table.lock); @@ -225,7 +232,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) continue; } - if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && + sk->sk_bound_dev_if != sdif) continue; sock_hold(sk); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index bc3da7796e8ae0e83dbd254d433bfa7ca34133f8..a24f94abe14f04d96cf2d74fdf588a556e6d9fd6 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -724,6 +724,7 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) int ret = -EINVAL; int chk_addr_ret; + lock_sock(sk); if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; @@ -743,7 +744,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ sk_dst_reset(sk); ret = 0; -out: return ret; +out: + release_sock(sk); + return ret; } /* diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5b410171a20baf612c365c303df352634de6862d..6d955ee09d1c9a3f288a2b76a1aeaac424ac7a12 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1674,11 +1674,13 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (!copied) copied = used; break; - } else if (used <= len) { - seq += used; - copied += used; - offset += used; } + if (WARN_ON_ONCE(used > len)) + used = 
len; + seq += used; + copied += used; + offset += used; + /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 1e5e2e4be0b27142ad02716e9988a00c8840dbce..e85b5f57d3e92b05dcbabeeb787a53b8a66f017c 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -17,6 +17,7 @@ #include #include #include +#include static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, int tos, int oif, @@ -126,7 +127,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) fl4->flowi4_proto = iph->protocol; fl4->daddr = reverse ? iph->saddr : iph->daddr; fl4->saddr = reverse ? iph->daddr : iph->saddr; - fl4->flowi4_tos = iph->tos; + fl4->flowi4_tos = iph->tos & ~INET_ECN_MASK; if (!ip_is_fragment(iph)) { switch (iph->protocol) { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 25317d5ccf2cc966b5328d8ccb161e1ffe718a1e..e3abfc97fde7fd1bff8e80cde0a9d86829f04f1a 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -241,6 +241,11 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; + + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e0e464b72c1fa4c936d8226fef7896eead4da72f..5ff67cb8b6ace353657c8d4a735e2d059f25dda4 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -112,7 +112,7 @@ void fib6_update_sernum(struct net *net, struct fib6_info *f6i) fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) - fn->fn_sernum = fib6_new_sernum(net); + WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* @@ -544,12 +544,13 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; - cb->args[5] = w->root->fn_sernum; + cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { - if (cb->args[5] != w->root->fn_sernum) { + int sernum = READ_ONCE(w->root->fn_sernum); + if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ - cb->args[5] = w->root->fn_sernum; + cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; @@ -1203,7 +1204,7 @@ static void __fib6_update_sernum_upto_root(struct fib6_info *rt, /* paired with smp_rmb() in rt6_get_cookie_safe() */ smp_wmb(); while (fn) { - fn->fn_sernum = sernum; + WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } @@ -1983,8 +1984,8 @@ static int fib6_clean_node(struct fib6_walker *w) }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && - w->node->fn_sernum != c->sernum) - w->node->fn_sernum = c->sernum; + READ_ONCE(w->node->fn_sernum) != c->sernum) + WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); @@ -2332,7 +2333,7 @@ static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; - iter->sernum = iter->w.root->fn_sernum; + iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } @@ -2360,8 +2361,10 @@ static struct fib6_table 
*ipv6_route_seq_next_table(struct fib6_table *tbl, static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { - if (iter->sernum != iter->w.root->fn_sernum) { - iter->sernum = iter->w.root->fn_sernum; + int sernum = READ_ONCE(iter->w.root->fn_sernum); + + if (iter->sernum != sernum) { + iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index c7e495f1201105f1ac1724a7b8fd82399efcce32..6c47cd0ef2401e9e0105edf5f8fe3d1a3b97bbeb 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -98,6 +98,8 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (likely(ops && ops->callbacks.gso_segment)) { skb_reset_transport_header(skb); segs = ops->callbacks.gso_segment(skb, features); + if (!segs) + skb->network_header = skb_mac_header(skb) + nhoff - skb->head; } if (IS_ERR_OR_NULL(segs)) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 251ec12517e9397b79940d427f73b8f25341b557..373df391dc93cabc9ed71aef51b14dd755bf15a5 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1259,8 +1259,6 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, if (np->frag_size) mtu = np->frag_size; } - if (mtu < IPV6_MIN_MTU) - return -EINVAL; cork->base.fragsize = mtu; cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; @@ -1320,8 +1318,6 @@ static int __ip6_append_data(struct sock *sk, fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + (opt ? opt->opt_nflen : 0); - maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - - sizeof(struct frag_hdr); headersize = sizeof(struct ipv6hdr) + (opt ? opt->opt_flen + opt->opt_nflen : 0) + @@ -1329,6 +1325,13 @@ static int __ip6_append_data(struct sock *sk, sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; + if (mtu <= fragheaderlen || + ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr)) + goto emsgsize; + + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - + sizeof(struct frag_hdr); + /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit * the first fragment */ diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 35c127c3eee78b35d00b6cb28c796a55ad1147bb..b647a4037679540306ceff718cf933ee8f8e8137 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1005,14 +1005,14 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) - pr_warn("%s xmit: Local address not yet configured!\n", - p->name); + pr_warn_ratelimited("%s xmit: Local address not yet configured!\n", + p->name); else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && !ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE))) - pr_warn("%s xmit: Routing loop! Remote address found on this node!\n", - p->name); + pr_warn_ratelimited("%s xmit: Routing loop! 
Remote address found on this node!\n", + p->name); else ret = 1; rcu_read_unlock(); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 290badfe70e06e61c0368bfc050cb2cefea1308e..866ce815625e553bd51b74ccd80155b7cd68c0a6 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -799,6 +799,8 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); + memset(&p1, 0, sizeof(p1)); + switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 9a8923e6a8fafa7cd4a05dd3a6bc907edb0e39a2..d392a72cd784e00b38302a6e6846516895e7e09d 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -251,7 +251,9 @@ static int __net_init ip6mr_rules_init(struct net *net) return 0; err2: + rtnl_lock(); ip6mr_free_table(mrt); + rtnl_unlock(); err1: fib_rules_unregister(ops); return err; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 01f8e62302fabfdc200735026eb2bc8b1cdbe589..8efb03cc6f242a8123b96eea7a132c1ff83e251a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2320,7 +2320,7 @@ static void ip6_link_failure(struct sk_buff *skb) if (from) { fn = rcu_dereference(from->fib6_node); if (fn && (rt->rt6i_flags & RTF_DEFAULT)) - fn->fn_sernum = -1; + WRITE_ONCE(fn->fn_sernum, -1); } } rcu_read_unlock(); @@ -4424,6 +4424,19 @@ static void ip6_route_mpath_notify(struct fib6_info *rt, inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); } +static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + if (nla_len(nla) < sizeof(*gw)) { + NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY"); + return -EINVAL; + } + + *gw = nla_get_in6_addr(nla); + + return 0; +} + static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { @@ -4464,7 +4477,11 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - r_cfg.fc_gateway = nla_get_in6_addr(nla); + err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, + extack); + if (err) + goto cleanup; + r_cfg.fc_flags |= RTF_GATEWAY; } r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); @@ -4598,7 +4615,13 @@ static int ip6_route_multipath_del(struct fib6_config *cfg, nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - nla_memcpy(&r_cfg.fc_gateway, nla, 16); + err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, + extack); + if (err) { + last_err = err; + goto next_rtnh; + } + r_cfg.fc_flags |= RTF_GATEWAY; } } @@ -4606,6 +4629,7 @@ static int ip6_route_multipath_del(struct fib6_config *cfg, if (err) last_err = err; +next_rtnh: rtnh = rtnh_next(rtnh, &remaining); } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 581763d6df12d07a245d122e0a023aadc166a2f2..3ed5fe0055aff37dcc1e1476d9f139ca7ca16f7b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1069,7 +1069,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, kfree_skb(skb); return -EINVAL; } - if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) { + if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); return -EINVAL; } diff --git a/net/key/af_key.c b/net/key/af_key.c index c7d5a6015389b81972225ce420e75e93dbce8039..8ce513e08fd797f51d3cc440b3ea40ca22a520e3 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1709,7 +1709,7 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sad xfrm_probe_algs(); - supp_skb = compose_sadb_supported(hdr, GFP_KERNEL); + supp_skb = 
compose_sadb_supported(hdr, GFP_KERNEL | __GFP_ZERO); if (!supp_skb) { if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) pfk->registered &= ~(1<<hdr->sadb_msg_satype); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index fa0f3c1543ba586b8a06ecdcabdb02ad60d7ec27..d23cd51efc5ffbe9f37feb2ba0532d8ffe10b491 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -268,6 +268,7 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); + struct net_device *dev = NULL; struct llc_sap *sap; int rc = -EINVAL; @@ -279,14 +280,14 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) goto out; rc = -ENODEV; if (sk->sk_bound_dev_if) { - llc->dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); - if (llc->dev && addr->sllc_arphrd != llc->dev->type) { - dev_put(llc->dev); - llc->dev = NULL; + dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); + if (dev && addr->sllc_arphrd != dev->type) { + dev_put(dev); + dev = NULL; } } else - llc->dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd); - if (!llc->dev) + dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd); + if (!dev) goto out; rc = -EUSERS; llc->laddr.lsap = llc_ui_autoport(); @@ -296,6 +297,11 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) sap = llc_sap_open(llc->laddr.lsap, NULL); if (!sap) goto out; + + /* Note: We do not expect errors from this point. */ + llc->dev = dev; + dev = NULL; + memcpy(llc->laddr.mac, llc->dev->dev_addr, IFHWADDRLEN); memcpy(&llc->addr, addr, sizeof(llc->addr)); /* assign new connection to its SAP */ @@ -303,6 +309,7 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) sock_reset_flag(sk, SOCK_ZAPPED); rc = 0; out: + dev_put(dev); return rc; } @@ -325,6 +332,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); + struct net_device *dev = NULL; struct llc_sap *sap; int rc = -EINVAL; @@ -340,25 +348,26 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) rc = -ENODEV; rcu_read_lock(); if (sk->sk_bound_dev_if) { - llc->dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if); - if (llc->dev) { + dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if); + if (dev) { if (is_zero_ether_addr(addr->sllc_mac)) - memcpy(addr->sllc_mac, llc->dev->dev_addr, + memcpy(addr->sllc_mac, dev->dev_addr, IFHWADDRLEN); - if (addr->sllc_arphrd != llc->dev->type || + if (addr->sllc_arphrd != dev->type || !ether_addr_equal(addr->sllc_mac, - llc->dev->dev_addr)) { + dev->dev_addr)) { rc = -EINVAL; - llc->dev = NULL; + dev = NULL; } } - } else - llc->dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd, + } else { + dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd, addr->sllc_mac); - if (llc->dev) - dev_hold(llc->dev); + } + if (dev) + dev_hold(dev); rcu_read_unlock(); - if (!llc->dev) + if (!dev) goto out; if (!addr->sllc_sap) { rc = -EUSERS; @@ -391,6 +400,11 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) goto out_put; } } + + /* Note: We do not expect errors from this point.
*/ + llc->dev = dev; + dev = NULL; + llc->laddr.lsap = addr->sllc_sap; memcpy(llc->laddr.mac, addr->sllc_mac, IFHWADDRLEN); memcpy(&llc->addr, addr, sizeof(llc->addr)); @@ -401,6 +415,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) out_put: llc_sap_put(sap); out: + dev_put(dev); release_sock(sk); return rc; } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 3f0bdc728f891f8f277bad9236fb345fe427cbb1..ada3163b3fa3be1a5929156abb756144c4783532 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -335,14 +335,15 @@ static int __nf_register_net_hook(struct net *net, int pf, p = nf_entry_dereference(*pp); new_hooks = nf_hook_entries_grow(p, reg); - if (!IS_ERR(new_hooks)) + if (!IS_ERR(new_hooks)) { + hooks_validate(new_hooks); rcu_assign_pointer(*pp, new_hooks); + } mutex_unlock(&nf_hook_mutex); if (IS_ERR(new_hooks)) return PTR_ERR(new_hooks); - hooks_validate(new_hooks); #ifdef CONFIG_NETFILTER_INGRESS if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) net_inc_ingress_queue(); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index ee6d98355081ee2651b60fce7eb3b778cfd15d0a..84c59de2788222b4532fd3f602399726f064c97f 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -46,6 +46,15 @@ void nf_unregister_queue_handler(struct net *net) } EXPORT_SYMBOL(nf_unregister_queue_handler); +static void nf_queue_sock_put(struct sock *sk) +{ +#ifdef CONFIG_INET + sock_gen_put(sk); +#else + sock_put(sk); +#endif +} + void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; @@ -56,7 +65,7 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry) if (state->out) dev_put(state->out); if (state->sk) - sock_put(state->sk); + nf_queue_sock_put(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (entry->skb->nf_bridge) { struct net_device *physdev; @@ -73,10 +82,13 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry) EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); /* Bump dev refs so they don't vanish while packet is out */ -void nf_queue_entry_get_refs(struct nf_queue_entry *entry) +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; + if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt)) + return false; + if (state->in) dev_hold(state->in); if (state->out) @@ -95,6 +107,7 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) dev_hold(physdev); } #endif + return true; } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); @@ -186,7 +199,10 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, .size = sizeof(*entry) + route_key_size, }; - nf_queue_entry_get_refs(entry); + if (!nf_queue_entry_get_refs(entry)) { + kfree(entry); + return -ENOTCONN; + } switch (entry->state.pf) { case AF_INET: diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index a3850414dba2d6cc52c8f5fbaf05e4fcf508900a..7dfaad783cd55ff6ffd7fe919bb7a9fb924feb9a 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -144,7 +144,7 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv) struct nft_rule *const *rules; const struct nft_rule *rule; const struct nft_expr *expr, *last; - struct nft_regs regs; + struct nft_regs regs = {}; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; bool genbit = READ_ONCE(net->nft.gencursor); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c 
index 8955431f2ab26553a54e7fd70f97c2c0454c4d78..a5aff2834bd6c4d95fa44b4d2f96b9e1a827f2de 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -716,9 +716,15 @@ static struct nf_queue_entry * nf_queue_entry_dup(struct nf_queue_entry *e) { struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); - if (entry) - nf_queue_entry_get_refs(entry); - return entry; + + if (!entry) + return NULL; + + if (nf_queue_entry_get_refs(entry)) + return entry; + + kfree(entry); + return NULL; } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index b1a9f330a51fe26b73b7d9c7f1d8a5cfa4b65435..fd87216bc0a99ef2a01a5fa22f4490001c6df9c3 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -194,6 +194,9 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, struct sk_buff *skb, unsigned int *l4csum_offset) { + if (pkt->xt.fragoff) + return -1; + switch (pkt->tprot) { case IPPROTO_TCP: *l4csum_offset = offsetof(struct tcphdr, check); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index fbf7d5ef83c711178db7fd2ad1be9c7799547d6d..ddf0d081b5bc60a1a57973503ca03d251e57b1c7 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1729,7 +1729,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) err = -ENOSPC; if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) { __dev_remove_pack(&po->prot_hook); - po->fanout = match; + + /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ + WRITE_ONCE(po->fanout, match); + po->rollover = rollover; rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); @@ -2238,8 +2241,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, copy_skb = skb_get(skb); skb_head = skb->data; } - if (copy_skb) + if (copy_skb) { + memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0, + sizeof(PACKET_SKB_CB(copy_skb)->sa.ll)); skb_set_owner_r(copy_skb, sk); + } } snaplen = po->rx_ring.frame_size - macoff; if ((int)snaplen < 0) { @@ -3397,6 +3403,8 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, sock_recv_ts_and_drops(msg, sk, skb); if (msg->msg_name) { + const size_t max_len = min(sizeof(skb->cb), + sizeof(struct sockaddr_storage)); int copy_len; /* If the address length field is there to be filled @@ -3419,6 +3427,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, msg->msg_namelen = sizeof(struct sockaddr_ll); } } + if (WARN_ON_ONCE(copy_len > max_len)) { + copy_len = max_len; + msg->msg_namelen = copy_len; + } memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); } @@ -3871,7 +3883,8 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv } case PACKET_FANOUT_DATA: { - if (!po->fanout) + /* Paired with the WRITE_ONCE() in fanout_add() */ + if (!READ_ONCE(po->fanout)) return -EINVAL; return fanout_set_data(po, optval, optlen); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index f2c4bfc7966331bfca17c1656d08d70e7b22b397..26851ca5566aaf65c7a09b21a0103b30c8f02ecd 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -609,15 +609,24 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, restart_act_graph: for (i = 0; i < nr_actions; i++) { const struct tc_action *a = actions[i]; + int repeat_ttl; if (jmp_prgcnt > 0) { jmp_prgcnt -= 1; continue; } + + repeat_ttl = 32; repeat: ret = a->ops->act(skb, a, res); - if (ret == TC_ACT_REPEAT) - goto repeat; /* we need a ttl - 
JHS */ + + if (unlikely(ret == TC_ACT_REPEAT)) { + if (--repeat_ttl != 0) + goto repeat; + /* suspicious opcode, stop pipeline */ + net_warn_ratelimited("TC_ACT_REPEAT abuse ?\n"); + return TC_ACT_OK; + } if (TC_ACT_EXT_CMP(ret, TC_ACT_JUMP)) { jmp_prgcnt = ret & TCA_ACT_MAX_PRIO_MASK; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 4413aa8d4e82982e23083787ede98912f0cd4494..3ca18e3a2be93ea174c2b8d293c808d4c0d00eeb 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -539,6 +539,7 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, struct netlink_ext_ack *extack) { struct tcf_block *block; + int err = 0; if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) { block = tcf_block_lookup(net, block_index); @@ -550,55 +551,95 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, const struct Qdisc_class_ops *cops; struct net_device *dev; + rcu_read_lock(); + /* Find link */ - dev = __dev_get_by_index(net, ifindex); - if (!dev) + dev = dev_get_by_index_rcu(net, ifindex); + if (!dev) { + rcu_read_unlock(); return ERR_PTR(-ENODEV); + } /* Find qdisc */ if (!*parent) { *q = dev->qdisc; *parent = (*q)->handle; } else { - *q = qdisc_lookup(dev, TC_H_MAJ(*parent)); + *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent)); if (!*q) { NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); - return ERR_PTR(-EINVAL); + err = -EINVAL; + goto errout_rcu; } } + *q = qdisc_refcount_inc_nz(*q); + if (!*q) { + NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); + err = -EINVAL; + goto errout_rcu; + } + /* Is it classful? */ cops = (*q)->ops->cl_ops; if (!cops) { NL_SET_ERR_MSG(extack, "Qdisc not classful"); - return ERR_PTR(-EINVAL); + err = -EINVAL; + goto errout_rcu; } if (!cops->tcf_block) { NL_SET_ERR_MSG(extack, "Class doesn't support blocks"); - return ERR_PTR(-EOPNOTSUPP); + err = -EOPNOTSUPP; + goto errout_rcu; } + /* At this point we know that qdisc is not noop_qdisc, + * which means that qdisc holds a reference to net_device + * and we hold a reference to qdisc, so it is safe to release + * rcu read lock. + */ + rcu_read_unlock(); + /* Do we search for filter, attached to class? */ if (TC_H_MIN(*parent)) { *cl = cops->find(*q, *parent); if (*cl == 0) { NL_SET_ERR_MSG(extack, "Specified class doesn't exist"); - return ERR_PTR(-ENOENT); + err = -ENOENT; + goto errout_qdisc; } } /* And the last stroke */ block = cops->tcf_block(*q, *cl, extack); - if (!block) - return ERR_PTR(-EINVAL); + if (!block) { + err = -EINVAL; + goto errout_qdisc; + } if (tcf_block_shared(block)) { NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters"); - return ERR_PTR(-EOPNOTSUPP); + err = -EOPNOTSUPP; + goto errout_qdisc; } } return block; + +errout_rcu: + rcu_read_unlock(); +errout_qdisc: + if (*q) { + qdisc_destroy(*q); + *q = NULL; + } + return ERR_PTR(err); +} + +static void tcf_block_release(struct Qdisc *q, struct tcf_block *block) +{ + if (q) + qdisc_destroy(q); } struct tcf_block_owner_item { @@ -1336,6 +1377,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, errout: if (chain) tcf_chain_put(chain); + tcf_block_release(q, block); if (err == -EAGAIN) /* Replay the request. 
*/ goto replay; @@ -1457,6 +1499,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, errout: if (chain) tcf_chain_put(chain); + tcf_block_release(q, block); return err; } @@ -1542,6 +1585,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, errout: if (chain) tcf_chain_put(chain); + tcf_block_release(q, block); return err; } @@ -1858,7 +1902,8 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); - return -EINVAL; + err = -EINVAL; + goto errout_block; } chain = tcf_chain_lookup(block, chain_index); if (n->nlmsg_type == RTM_NEWCHAIN) { @@ -1870,23 +1915,27 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, tcf_chain_hold(chain); } else { NL_SET_ERR_MSG(extack, "Filter chain already exists"); - return -EEXIST; + err = -EEXIST; + goto errout_block; } } else { if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); - return -ENOENT; + err = -ENOENT; + goto errout_block; } chain = tcf_chain_create(block, chain_index); if (!chain) { NL_SET_ERR_MSG(extack, "Failed to create filter chain"); - return -ENOMEM; + err = -ENOMEM; + goto errout_block; } } } else { if (!chain || tcf_chain_held_by_acts_only(chain)) { NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); - return -EINVAL; + err = -EINVAL; + goto errout_block; } tcf_chain_hold(chain); } @@ -1930,6 +1979,8 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, errout: tcf_chain_put(chain); +errout_block: + tcf_block_release(q, block); if (err == -EAGAIN) /* Replay the request. 
*/ goto replay; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 2c9aa550763ddfa0c03ec9ea20ca6bbbba56ac2f..19c390ab2b7c936e144fb3af0163fc2f89f552ff 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -315,6 +315,24 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) return q; } +struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle) +{ + struct netdev_queue *nq; + struct Qdisc *q; + + if (!handle) + return NULL; + q = qdisc_match_from_root(dev->qdisc, handle); + if (q) + goto out; + + nq = dev_ingress_queue_rcu(dev); + if (nq) + q = qdisc_match_from_root(nq->qdisc_sleeping, handle); +out: + return q; +} + static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) { unsigned long cl; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index c2276a3c9dd8db1a7310b5840a92795c176f6764..ef41320a9e96047adec951d9e30ff3662030d8af 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -953,6 +953,13 @@ void qdisc_free(struct Qdisc *qdisc) kfree((char *) qdisc - qdisc->padded); } +void qdisc_free_cb(struct rcu_head *head) +{ + struct Qdisc *q = container_of(head, struct Qdisc, rcu); + + qdisc_free(q); +} + void qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops; @@ -990,7 +997,7 @@ void qdisc_destroy(struct Qdisc *qdisc) kfree_skb_list(skb); } - qdisc_free(qdisc); + call_rcu(&qdisc->rcu, qdisc_free_cb); } EXPORT_SYMBOL(qdisc_destroy); diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 73dcda060335fc1a52105b399a65f20bf12f4d6e..60fb9529c6ad41a13ce3b16357256e31232bcea9 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -111,6 +111,7 @@ static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt) * done without the correct namespace: */ .flags = RPC_CLNT_CREATE_NOPING | + RPC_CLNT_CREATE_CONNECTED | RPC_CLNT_CREATE_NO_IDLE_TIMEOUT }; struct rpc_clnt *clnt; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 9ac94c774335e2a8316e01ab0f0ffbee21964a39..dc58c227f37c395f5650e7d9d445e474b4ae454a 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -79,6 +79,7 @@ static void call_connect_status(struct rpc_task *task); static __be32 *rpc_encode_header(struct rpc_task *task); static __be32 *rpc_verify_header(struct rpc_task *task); static int rpc_ping(struct rpc_clnt *clnt); +static int rpc_ping_noreply(struct rpc_clnt *clnt); static void rpc_register_client(struct rpc_clnt *clnt) { @@ -481,6 +482,12 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, rpc_shutdown_client(clnt); return ERR_PTR(err); } + } else if (args->flags & RPC_CLNT_CREATE_CONNECTED) { + int err = rpc_ping_noreply(clnt); + if (err != 0) { + rpc_shutdown_client(clnt); + return ERR_PTR(err); + } } clnt->cl_softrtry = 1; @@ -2545,6 +2552,10 @@ static const struct rpc_procinfo rpcproc_null = { .p_decode = rpcproc_decode_null, }; +static const struct rpc_procinfo rpcproc_null_noreply = { + .p_encode = rpcproc_encode_null, +}; + static int rpc_ping(struct rpc_clnt *clnt) { struct rpc_message msg = { @@ -2557,6 +2568,32 @@ static int rpc_ping(struct rpc_clnt *clnt) return err; } +static int rpc_ping_noreply(struct rpc_clnt *clnt) +{ + struct rpc_message msg = { + .rpc_proc = &rpcproc_null_noreply, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = &rpc_default_ops, + .flags = RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + }; + struct rpc_task *task; + int status; + + msg.rpc_cred = 
authnull_ops.lookup_cred(NULL, NULL, 0); + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + put_rpccred(msg.rpc_cred); + return PTR_ERR(task); + } + status = task->tk_status; + rpc_put_task(task); + put_rpccred(msg.rpc_cred); + return status; +} + static struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, struct rpc_xprt *xprt, struct rpc_cred *cred, int flags, diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index f93d386ae9232cbdb380ba21545355e9bfcef671..38598edcd86e5361bec09c70e7011454f127e2c9 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2978,9 +2978,6 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) } xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL); - ret = ERR_PTR(xs_local_setup_socket(transport)); - if (ret) - goto out_err; break; default: ret = ERR_PTR(-EAFNOSUPPORT); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 8bbe1b8e4ff7f3da19a1809dc75b6fac8d23e1f1..4d283e26d81621d32fd21e33127d0f0e53233e00 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -197,8 +197,11 @@ void wait_for_unix_gc(void) { /* If number of inflight sockets is insane, * force a garbage collect right now. + * Paired with the WRITE_ONCE() in unix_inflight(), + * unix_notinflight() and gc_in_progress(). */ - if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress) + if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && + !READ_ONCE(gc_in_progress)) unix_gc(); wait_event(unix_gc_wait, gc_in_progress == false); } @@ -218,7 +221,9 @@ void unix_gc(void) if (gc_in_progress) goto out; - gc_in_progress = true; + /* Paired with READ_ONCE() in wait_for_unix_gc(). */ + WRITE_ONCE(gc_in_progress, true); + /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. @@ -304,7 +309,10 @@ void unix_gc(void) /* All candidates should have been detached by now. */ BUG_ON(!list_empty(&gc_candidates)); - gc_in_progress = false; + + /* Paired with READ_ONCE() in wait_for_unix_gc(). 
*/ + WRITE_ONCE(gc_in_progress, false); + wake_up(&unix_gc_wait); out: diff --git a/net/unix/scm.c b/net/unix/scm.c index 8c40f2b323924d304111b7b8981e9bb209722d5b..ce700b22eccee4ea8e962268d0b8ef0b05fe1d61 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -59,7 +59,8 @@ void unix_inflight(struct user_struct *user, struct file *fp) } else { BUG_ON(list_empty(&u->link)); } - unix_tot_inflight++; + /* Paired with READ_ONCE() in wait_for_unix_gc() */ + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); } user->unix_inflight++; spin_unlock(&unix_gc_lock); @@ -79,7 +80,8 @@ void unix_notinflight(struct user_struct *user, struct file *fp) if (atomic_long_dec_and_test(&u->inflight)) list_del_init(&u->link); - unix_tot_inflight--; + /* Paired with READ_ONCE() in wait_for_unix_gc() */ + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); } user->unix_inflight--; spin_unlock(&unix_gc_lock); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index e7a0ce98479f31692b676f395a011b6843321d06..8a9f02997067e618a5d084c21737fc3a17249a3d 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -153,6 +153,9 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, if (x->encap || x->tfcpad) return -EINVAL; + if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND)) + return -EINVAL; + dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { @@ -190,7 +193,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, xso->dev = dev; xso->num_exthdrs = 1; - xso->flags = xuo->flags; + /* Don't forward bit that is not implemented */ + xso->flags = xuo->flags & ~XFRM_OFFLOAD_IPV6; err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 9dd43319b2c06854f054f9ecf7df519d3c6318e3..18d27ce2660080db43447ac0011480288edca04c 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -662,11 +662,16 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); - struct xfrm_if_parms p; + struct xfrm_if_parms p = {}; struct xfrm_if *xi; int err; xfrmi_netlink_parms(data, &p); + if (!p.if_id) { + NL_SET_ERR_MSG(extack, "if_id must be non zero"); + return -EINVAL; + } + xi = xfrmi_locate(net, &p); if (xi) return -EEXIST; @@ -691,9 +696,14 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], { struct xfrm_if *xi = netdev_priv(dev); struct net *net = xi->net; - struct xfrm_if_parms p; + struct xfrm_if_parms p = {}; xfrmi_netlink_parms(data, &p); + if (!p.if_id) { + NL_SET_ERR_MSG(extack, "if_id must be non zero"); + return -EINVAL; + } + xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 84dea0ad16661888412b0354d0d10870d9b314f8..bc7e982a86d1c9af8c83e1869448b9980741b1b7 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1443,9 +1443,6 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, memcpy(&x->mark, &orig->mark, sizeof(x->mark)); memcpy(&x->props.smark, &orig->props.smark, sizeof(x->props.smark)); - if (xfrm_init_state(x) < 0) - goto error; - x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; @@ -1524,6 +1521,11 @@ struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, if (!xc) return NULL; + xc->props.family = m->new_family; + + if (xfrm_init_state(xc) < 0) + goto error; + memcpy(&xc->id.daddr, &m->new_daddr, 
sizeof(xc->id.daddr)); memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr)); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index f94abe1fdd58f3700911baa5913a161b1dcf02a9..87932f6ad9d75b8eb86d3b6e255d5f7e21c23627 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2813,7 +2813,7 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x) if (x->props.extra_flags) l += nla_total_size(sizeof(x->props.extra_flags)); if (x->xso.dev) - l += nla_total_size(sizeof(x->xso)); + l += nla_total_size(sizeof(struct xfrm_user_offload)); if (x->props.smark.v | x->props.smark.m) { l += nla_total_size(sizeof(x->props.smark.v)); l += nla_total_size(sizeof(x->props.smark.m)); diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index c0b08ac32dcf48d34c26b6ec0a1fec1a94cba3a7..47f878b1399017f32911c9d819f8d803864c49c2 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -190,6 +190,12 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry, break; case EVM_IMA_XATTR_DIGSIG: case EVM_XATTR_PORTABLE_DIGSIG: + /* accept xattr with non-empty signature field */ + if (xattr_len <= sizeof(struct signature_v2_hdr)) { + evm_status = INTEGRITY_FAIL; + goto out; + } + hdr = (struct signature_v2_hdr *)xattr_data; digest.hdr.algo = hdr->hash_algo; rc = evm_calc_hash(dentry, xattr_name, xattr_value, diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 8d58874a8caa464b460141f823b89255b14682d3..2619dee3b4a566ba406a5c32ce2f624c2680580c 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -56,6 +56,7 @@ extern int ima_policy_flag; extern int ima_hash_algo; extern int ima_appraise; extern struct tpm_chip *ima_tpm_chip; +extern const char boot_aggregate_name[]; /* IMA event related data */ struct ima_event_data { @@ -139,7 +140,7 @@ int ima_calc_buffer_hash(const void *buf, loff_t len, int ima_calc_field_array_hash(struct ima_field_data *field_data, struct ima_template_desc *desc, int num_fields, struct ima_digest_data *hash); -int __init ima_calc_boot_aggregate(struct ima_digest_data *hash); +int ima_calc_boot_aggregate(struct ima_digest_data *hash); void ima_add_violation(struct file *file, const unsigned char *filename, struct integrity_iint_cache *iint, const char *op, const char *cause); diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index d9f4b4c845193cef05657962892a06307bd2cfbb..cda1697c8fda6193bd287def4de34b0433b90a1c 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -415,7 +415,7 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) loff_t i_size; int rc; struct file *f = file; - bool new_file_instance = false, modified_flags = false; + bool new_file_instance = false; /* * For consistency, fail file's opened with the O_DIRECT flag on @@ -433,18 +433,10 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) O_TRUNC | O_CREAT | O_NOCTTY | O_EXCL); flags |= O_RDONLY; f = dentry_open(&file->f_path, flags, file->f_cred); - if (IS_ERR(f)) { - /* - * Cannot open the file again, lets modify f_flags - * of original and continue - */ - pr_info_ratelimited("Unable to reopen file for reading.\n"); - f = file; - f->f_flags |= FMODE_READ; - modified_flags = true; - } else { - new_file_instance = true; - } + if (IS_ERR(f)) + return PTR_ERR(f); + + new_file_instance = true; } i_size = i_size_read(file_inode(f)); @@ -459,8 +451,6 @@ int 
ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) out: if (new_file_instance) fput(f); - else if (modified_flags) - f->f_flags &= ~FMODE_READ; return rc; } @@ -651,7 +641,7 @@ int ima_calc_buffer_hash(const void *buf, loff_t len, return calc_buffer_shash(buf, len, hash); } -static void __init ima_pcrread(int idx, u8 *pcr) +static void ima_pcrread(int idx, u8 *pcr) { if (!ima_tpm_chip) return; @@ -663,8 +653,8 @@ static void __init ima_pcrread(int idx, u8 *pcr) /* * Calculate the boot aggregate hash */ -static int __init ima_calc_boot_aggregate_tfm(char *digest, - struct crypto_shash *tfm) +static int ima_calc_boot_aggregate_tfm(char *digest, + struct crypto_shash *tfm) { /*to be compatible with tpm1.2 ,make sure the content is not arbitrary*/ u8 pcr_i[IMA_DIGEST_SIZE] = {0}; @@ -683,13 +673,15 @@ static int __init ima_calc_boot_aggregate_tfm(char *digest, ima_pcrread(i, pcr_i); /* now accumulate with current aggregate */ rc = crypto_shash_update(shash, pcr_i, TPM_DIGEST_SIZE); + if (rc != 0) + return rc; } if (!rc) crypto_shash_final(shash, digest); return rc; } -int __init ima_calc_boot_aggregate(struct ima_digest_data *hash) +int ima_calc_boot_aggregate(struct ima_digest_data *hash) { struct crypto_shash *tfm; int rc; diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index cfb8cc3b975e3614f00e132901601397d9dc6ecd..38bd565b9da9b5bf19ebb0d5d6aa517a0860baad 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -343,8 +343,7 @@ static ssize_t ima_write_policy(struct file *file, const char __user *buf, integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL, "policy_update", "signed policy required", 1, 0); - if (ima_appraise & IMA_APPRAISE_ENFORCE) - result = -EACCES; + result = -EACCES; } else { result = ima_parse_add_rule(data); } @@ -498,12 +497,12 @@ int __init ima_fs_init(void) return 0; out: + securityfs_remove(ima_policy); securityfs_remove(violations); securityfs_remove(runtime_measurements_count); securityfs_remove(ascii_runtime_measurements); securityfs_remove(binary_runtime_measurements); securityfs_remove(ima_symlink); securityfs_remove(ima_dir); - securityfs_remove(ima_policy); return -1; } diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c index faac9ecaa0aeff26a4a83acb593b79ab4a5c91d7..a2bc4cb4482aab3f35cc522089b5d72b3d887e9a 100644 --- a/security/integrity/ima/ima_init.c +++ b/security/integrity/ima/ima_init.c @@ -25,7 +25,7 @@ #include "ima.h" /* name for boot aggregate entry */ -static const char *boot_aggregate_name = "boot_aggregate"; +const char boot_aggregate_name[] = "boot_aggregate"; struct tpm_chip *ima_tpm_chip; /* Add the boot aggregate to the IMA measurement list and extend diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c index 30db39b2380432b1770f0f03fd84a75d2dfb9cb1..4dfdccce497b6a1f2805fd7e57b75ab211b21a76 100644 --- a/security/integrity/ima/ima_template.c +++ b/security/integrity/ima/ima_template.c @@ -31,6 +31,7 @@ static struct ima_template_desc builtin_templates[] = { static LIST_HEAD(defined_templates); static DEFINE_SPINLOCK(template_list); +static int template_setup_done; static struct ima_template_field supported_fields[] = { {.field_id = "d", .field_init = ima_eventdigest_init, @@ -57,10 +58,11 @@ static int __init ima_template_setup(char *str) struct ima_template_desc *template_desc; int template_len = strlen(str); - if (ima_template) + if (template_setup_done) return 1; - ima_init_template_list(); + 
if (!ima_template) + ima_init_template_list(); /* * Verify that a template with the supplied name exists. @@ -84,6 +86,7 @@ static int __init ima_template_setup(char *str) } ima_template = template_desc; + template_setup_done = 1; return 1; } __setup("ima_template=", ima_template_setup); @@ -92,7 +95,7 @@ static int __init ima_template_fmt_setup(char *str) { int num_templates = ARRAY_SIZE(builtin_templates); - if (ima_template) + if (template_setup_done) return 1; if (template_desc_init_fields(str, NULL, NULL) < 0) { @@ -103,6 +106,7 @@ static int __init ima_template_fmt_setup(char *str) builtin_templates[num_templates - 1].fmt = str; ima_template = builtin_templates + num_templates - 1; + template_setup_done = 1; return 1; } diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c index 43752002c2223ca0315237b3dc2bf5df9ac5c4c0..48c5a1be88ac4554b2b3d051c07b9d7d100055d3 100644 --- a/security/integrity/ima/ima_template_lib.c +++ b/security/integrity/ima/ima_template_lib.c @@ -284,6 +284,24 @@ int ima_eventdigest_init(struct ima_event_data *event_data, goto out; } + if ((const char *)event_data->filename == boot_aggregate_name) { + if (ima_tpm_chip) { + hash.hdr.algo = HASH_ALGO_SHA1; + result = ima_calc_boot_aggregate(&hash.hdr); + + /* algo can change depending on available PCR banks */ + if (!result && hash.hdr.algo != HASH_ALGO_SHA1) + result = -EINVAL; + + if (result < 0) + memset(&hash, 0, sizeof(hash)); + } + + cur_digest = hash.hdr.digest; + cur_digestsize = hash_digest_size[HASH_ALGO_SHA1]; + goto out; + } + if (!event_data->file) /* missing info to re-calculate the digest */ return -EINVAL; diff --git a/security/integrity/integrity_audit.c b/security/integrity/integrity_audit.c index 82c98f7d217e80dbce90de4c4532f44a50d7dea9..d03fbdfc972e82986a69773393ce387d65f89118 100644 --- a/security/integrity/integrity_audit.c +++ b/security/integrity/integrity_audit.c @@ -39,6 +39,8 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode, return; ab = audit_log_start(audit_context(), GFP_KERNEL, audit_msgno); + if (!ab) + return; audit_log_format(ab, "pid=%d uid=%u auid=%u ses=%u", task_pid_nr(current), from_kuid(&init_user_ns, current_cred()->uid), diff --git a/security/security.c b/security/security.c index 9e4d6c999c79f8898d95f8e83ce491a04cae2d51..8a9e1ececd7d6a995c6257706bac6b58d0bd3b3e 100644 --- a/security/security.c +++ b/security/security.c @@ -232,25 +232,25 @@ EXPORT_SYMBOL(unregister_lsm_notifier); /* Security operations */ -int security_binder_set_context_mgr(struct task_struct *mgr) +int security_binder_set_context_mgr(const struct cred *mgr) { return call_int_hook(binder_set_context_mgr, 0, mgr); } -int security_binder_transaction(struct task_struct *from, - struct task_struct *to) +int security_binder_transaction(const struct cred *from, + const struct cred *to) { return call_int_hook(binder_transaction, 0, from, to); } -int security_binder_transfer_binder(struct task_struct *from, - struct task_struct *to) +int security_binder_transfer_binder(const struct cred *from, + const struct cred *to) { return call_int_hook(binder_transfer_binder, 0, from, to); } -int security_binder_transfer_file(struct task_struct *from, - struct task_struct *to, struct file *file) +int security_binder_transfer_file(const struct cred *from, + const struct cred *to, struct file *file) { return call_int_hook(binder_transfer_file, 0, from, to, file); } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 
1b611c068669194e0c8ddda95588672151232ec6..056f5de53e7e31cb1dd6267e227eabd28ce3972d 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2218,22 +2218,19 @@ static inline u32 open_file_to_av(struct file *file) /* Hook functions begin here. */ -static int selinux_binder_set_context_mgr(struct task_struct *mgr) +static int selinux_binder_set_context_mgr(const struct cred *mgr) { - u32 mysid = current_sid(); - u32 mgrsid = task_sid(mgr); - return avc_has_perm(&selinux_state, - mysid, mgrsid, SECCLASS_BINDER, + current_sid(), cred_sid(mgr), SECCLASS_BINDER, BINDER__SET_CONTEXT_MGR, NULL); } -static int selinux_binder_transaction(struct task_struct *from, - struct task_struct *to) +static int selinux_binder_transaction(const struct cred *from, + const struct cred *to) { u32 mysid = current_sid(); - u32 fromsid = task_sid(from); - u32 tosid = task_sid(to); + u32 fromsid = cred_sid(from); + u32 tosid = cred_sid(to); int rc; if (mysid != fromsid) { @@ -2244,27 +2241,24 @@ static int selinux_binder_transaction(struct task_struct *from, return rc; } - return avc_has_perm(&selinux_state, - fromsid, tosid, SECCLASS_BINDER, BINDER__CALL, - NULL); + return avc_has_perm(&selinux_state, fromsid, tosid, + SECCLASS_BINDER, BINDER__CALL, NULL); } -static int selinux_binder_transfer_binder(struct task_struct *from, - struct task_struct *to) +static int selinux_binder_transfer_binder(const struct cred *from, + const struct cred *to) { - u32 fromsid = task_sid(from); - u32 tosid = task_sid(to); - return avc_has_perm(&selinux_state, - fromsid, tosid, SECCLASS_BINDER, BINDER__TRANSFER, + cred_sid(from), cred_sid(to), + SECCLASS_BINDER, BINDER__TRANSFER, NULL); } -static int selinux_binder_transfer_file(struct task_struct *from, - struct task_struct *to, +static int selinux_binder_transfer_file(const struct cred *from, + const struct cred *to, struct file *file) { - u32 sid = task_sid(to); + u32 sid = cred_sid(to); struct file_security_struct *fsec = file->f_security; struct dentry *dentry = file->f_path.dentry; struct inode_security_struct *isec; @@ -5799,7 +5793,7 @@ static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb, struct common_audit_data ad; struct lsm_network_audit net = {0,}; char *addrp; - u8 proto; + u8 proto = 0; if (sk == NULL) return NF_ACCEPT; diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 765df93de0c68d78d42e099ae3a4a4cf83d48271..04a8f0c1760932daad939dc2ce423ae61bb86da2 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -203,7 +203,7 @@ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCEs for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index ecf5fc77f50b56dbc8b195b4ecb6a31ed696ae93..952273d7afd99f95d0bfee7bcaed1a8b37869b00 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1318,7 +1318,7 @@ static int 
update_insn_state_regs(struct instruction *insn, struct insn_state *s struct cfi_reg *cfa = &state->cfa; struct stack_op *op = &insn->stack_op; - if (cfa->base != CFI_SP) + if (cfa->base != CFI_SP && cfa->base != CFI_SP_INDIRECT) return 0; /* push */