From: "Mike Pagano" <mpagano@gentoo.org>
To: gentoo-commits@lists.gentoo.org
Subject: [gentoo-commits] proj/linux-patches:6.1 commit in: /
Date: Sat, 1 Jul 2023 18:27:48 +0000 (UTC)
Message-ID: <1688236045.bee1a172fdd848464d7ea0df4de48778e3243e3c.mpagano@gentoo>
X-VCS-Repository: proj/linux-patches
X-VCS-Files: 0000_README 1036_linux-6.1.37.patch
X-VCS-Directories: /
X-VCS-Committer: mpagano
X-VCS-Committer-Name: Mike Pagano
X-VCS-Revision: bee1a172fdd848464d7ea0df4de48778e3243e3c
X-VCS-Branch: 6.1

commit:     bee1a172fdd848464d7ea0df4de48778e3243e3c
Author:     Mike Pagano <mpagano@gentoo.org>
AuthorDate: Sat Jul  1 18:27:25 2023 +0000
Commit:     Mike Pagano <mpagano@gentoo.org>
CommitDate: Sat Jul  1 18:27:25 2023 +0000
URL:        https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=bee1a172

Linux patch 6.1.37-gentoo

Signed-off-by: Mike Pagano <mpagano@gentoo.org>

 0000_README             |    4 +
 1036_linux-6.1.37.patch | 2681 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 2685 insertions(+)

diff --git a/0000_README b/0000_README
index 46c23470..d50ba74b 100644
--- a/0000_README
+++ b/0000_README
@@ -187,6 +187,10 @@ Patch:  1035_linux-6.1.36.patch
 From:   https://www.kernel.org
 Desc:   Linux 6.1.36
 
+Patch:  1036_linux-6.1.37.patch
+From:   https://www.kernel.org
+Desc:   Linux 6.1.37
+
 Patch:  1500_XATTR_USER_PREFIX.patch
 From:   https://bugs.gentoo.org/show_bug.cgi?id=470644
 Desc:   Support for namespace user.pax.* on tmpfs.
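Note on the patch below: the bulk of 6.1.37 is the stable backport of the
lock_mm_and_find_vma series, the stack-expansion hardening commonly associated
with "StackRot" (CVE-2023-3269). Instead of each architecture open-coding
mmap_read_lock() + find_vma() + VM_GROWSDOWN checks + expand_stack() in its
page-fault handler, converted architectures select LOCK_MM_AND_FIND_VMA and
call one shared helper that performs any stack growth under the mmap *write*
lock. The following is a minimal sketch of that helper's shape, assuming the
generic mm/memory.c implementation; the killable lock variants and the
trylock/upgrade fast paths of the real code are simplified away here:

	/* Sketch only -- the real helper lives in mm/memory.c. */
	struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
	{
		struct vm_area_struct *vma;

		if (!mmap_read_trylock(mm)) {
			/*
			 * Deadlock avoidance: a kernel-mode fault from code
			 * outside the exception tables must not sleep on the
			 * mmap lock (regs may be NULL for coprocessor faults).
			 */
			if (regs && !user_mode(regs) &&
			    !search_exception_tables(instruction_pointer(regs)))
				return NULL;
			mmap_read_lock(mm);
		}

		vma = find_vma(mm, addr);
		if (likely(vma && vma->vm_start <= addr))
			return vma;	/* common case: hit under the read lock */

		if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
			mmap_read_unlock(mm);
			return NULL;	/* bad area, no lock held */
		}

		/* Stack growth now requires the write lock. */
		mmap_read_unlock(mm);
		mmap_write_lock(mm);
		vma = find_vma(mm, addr);	/* re-lookup: the lock was dropped */
		if (!vma ||
		    (vma->vm_start > addr &&
		     (!(vma->vm_flags & VM_GROWSDOWN) ||
		      expand_stack_locked(vma, addr)))) {
			mmap_write_unlock(mm);
			return NULL;
		}
		mmap_write_downgrade(mm);	/* callers expect the read lock */
		return vma;
	}

A NULL return means "bad area, mmap lock not held", which is why the per-arch
hunks below add bad_area_nosemaphore labels; expand_stack() itself becomes a
locking wrapper that takes the mm and returns the expanded VMA or NULL.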
diff --git a/1036_linux-6.1.37.patch b/1036_linux-6.1.37.patch new file mode 100644 index 00000000..62552bde --- /dev/null +++ b/1036_linux-6.1.37.patch @@ -0,0 +1,2681 @@ +diff --git a/Makefile b/Makefile +index cffb83d7a0fb6..26d73ce7e9267 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,7 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 + VERSION = 6 + PATCHLEVEL = 1 +-SUBLEVEL = 36 ++SUBLEVEL = 37 + EXTRAVERSION = + NAME = Curry Ramen + +diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig +index 97fce7386b002..d95d82abdf295 100644 +--- a/arch/alpha/Kconfig ++++ b/arch/alpha/Kconfig +@@ -28,6 +28,7 @@ config ALPHA + select GENERIC_SMP_IDLE_THREAD + select HAVE_ARCH_AUDITSYSCALL + select HAVE_MOD_ARCH_SPECIFIC ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select ODD_RT_SIGACTION + select OLD_SIGSUSPEND +diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c +index ef427a6bdd1ab..2b49aa94e4de3 100644 +--- a/arch/alpha/mm/fault.c ++++ b/arch/alpha/mm/fault.c +@@ -119,20 +119,12 @@ do_page_fault(unsigned long address, unsigned long mmcsr, + flags |= FAULT_FLAG_USER; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + + /* Ok, we have a good vm_area for this memory access, so + we can handle it. */ +- good_area: + si_code = SEGV_ACCERR; + if (cause < 0) { + if (!(vma->vm_flags & VM_EXEC)) +@@ -189,6 +181,7 @@ retry: + bad_area: + mmap_read_unlock(mm); + ++ bad_area_nosemaphore: + if (user_mode(regs)) + goto do_sigsegv; + +diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig +index d9a13ccf89a3a..cb1074f74c3f1 100644 +--- a/arch/arc/Kconfig ++++ b/arch/arc/Kconfig +@@ -41,6 +41,7 @@ config ARC + select HAVE_PERF_EVENTS + select HAVE_SYSCALL_TRACEPOINTS + select IRQ_DOMAIN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select OF + select OF_EARLY_FLATTREE +diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c +index 5ca59a482632a..f59e722d147f9 100644 +--- a/arch/arc/mm/fault.c ++++ b/arch/arc/mm/fault.c +@@ -113,15 +113,9 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (unlikely(address < vma->vm_start)) { +- if (!(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, address)) +- goto bad_area; +- } ++ goto bad_area_nosemaphore; + + /* + * vm_area is good, now check permissions for this memory access +@@ -161,6 +155,7 @@ retry: + bad_area: + mmap_read_unlock(mm); + ++bad_area_nosemaphore: + /* + * Major/minor page fault accounting + * (in case of retry we only land here once) +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index a08c9d092a332..0202e48e7a207 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -122,6 +122,7 @@ config ARM + select HAVE_UID16 + select HAVE_VIRT_CPU_ACCOUNTING_GEN + select IRQ_FORCED_THREADING ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_REL + select NEED_DMA_MAP_STATE + select OF_EARLY_FLATTREE if OF +diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c +index de988cba9a4b1..b0db853103317 100644 +--- a/arch/arm/mm/fault.c ++++ 
b/arch/arm/mm/fault.c +@@ -231,37 +231,11 @@ static inline bool is_permission_fault(unsigned int fsr) + return false; + } + +-static vm_fault_t __kprobes +-__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int flags, +- unsigned long vma_flags, struct pt_regs *regs) +-{ +- struct vm_area_struct *vma = find_vma(mm, addr); +- if (unlikely(!vma)) +- return VM_FAULT_BADMAP; +- +- if (unlikely(vma->vm_start > addr)) { +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- return VM_FAULT_BADMAP; +- if (addr < FIRST_USER_ADDRESS) +- return VM_FAULT_BADMAP; +- if (expand_stack(vma, addr)) +- return VM_FAULT_BADMAP; +- } +- +- /* +- * ok, we have a good vm_area for this memory access, check the +- * permissions on the VMA allow for the fault which occurred. +- */ +- if (!(vma->vm_flags & vma_flags)) +- return VM_FAULT_BADACCESS; +- +- return handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); +-} +- + static int __kprobes + do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + { + struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; + int sig, code; + vm_fault_t fault; + unsigned int flags = FAULT_FLAG_DEFAULT; +@@ -300,31 +274,21 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + +- /* +- * As per x86, we may deadlock here. However, since the kernel only +- * validly references user space from well defined areas of the code, +- * we can bug out early if this is from code which shouldn't. +- */ +- if (!mmap_read_trylock(mm)) { +- if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc)) +- goto no_context; + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above down_read_trylock() might have succeeded in +- * which case, we'll have missed the might_sleep() from +- * down_read() +- */ +- might_sleep(); +-#ifdef CONFIG_DEBUG_VM +- if (!user_mode(regs) && +- !search_exception_tables(regs->ARM_pc)) +- goto no_context; +-#endif ++ vma = lock_mm_and_find_vma(mm, addr, regs); ++ if (unlikely(!vma)) { ++ fault = VM_FAULT_BADMAP; ++ goto bad_area; + } + +- fault = __do_page_fault(mm, addr, flags, vm_flags, regs); ++ /* ++ * ok, we have a good vm_area for this memory access, check the ++ * permissions on the VMA allow for the fault which occurred. ++ */ ++ if (!(vma->vm_flags & vm_flags)) ++ fault = VM_FAULT_BADACCESS; ++ else ++ fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); + + /* If we need to retry but a fatal signal is pending, handle the + * signal first. We do not need to release the mmap_lock because +@@ -355,6 +319,7 @@ retry: + if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) + return 0; + ++bad_area: + /* + * If we are in kernel mode at this point, we + * have no context to handle this fault with. 
+diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 43ff7c7a3ac97..20ee745c118ae 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -211,6 +211,7 @@ config ARM64 + select IRQ_DOMAIN + select IRQ_FORCED_THREADING + select KASAN_VMALLOC if KASAN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select NEED_DMA_MAP_STATE + select NEED_SG_DMA_LENGTH +diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c +index 2fef1fa93e7b6..6b6b8a82f2941 100644 +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -483,27 +483,14 @@ static void do_bad_area(unsigned long far, unsigned long esr, + #define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000) + #define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000) + +-static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, ++static vm_fault_t __do_page_fault(struct mm_struct *mm, ++ struct vm_area_struct *vma, unsigned long addr, + unsigned int mm_flags, unsigned long vm_flags, + struct pt_regs *regs) + { +- struct vm_area_struct *vma = find_vma(mm, addr); +- +- if (unlikely(!vma)) +- return VM_FAULT_BADMAP; +- + /* + * Ok, we have a good vm_area for this memory access, so we can handle + * it. +- */ +- if (unlikely(vma->vm_start > addr)) { +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- return VM_FAULT_BADMAP; +- if (expand_stack(vma, addr)) +- return VM_FAULT_BADMAP; +- } +- +- /* + * Check that the permissions on the VMA allow for the fault which + * occurred. + */ +@@ -535,6 +522,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, + unsigned long vm_flags; + unsigned int mm_flags = FAULT_FLAG_DEFAULT; + unsigned long addr = untagged_addr(far); ++ struct vm_area_struct *vma; + + if (kprobe_page_fault(regs, esr)) + return 0; +@@ -585,31 +573,14 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + +- /* +- * As per x86, we may deadlock here. However, since the kernel only +- * validly references user space from well defined areas of the code, +- * we can bug out early if this is from code which shouldn't. +- */ +- if (!mmap_read_trylock(mm)) { +- if (!user_mode(regs) && !search_exception_tables(regs->pc)) +- goto no_context; + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above mmap_read_trylock() might have succeeded in which +- * case, we'll have missed the might_sleep() from down_read(). +- */ +- might_sleep(); +-#ifdef CONFIG_DEBUG_VM +- if (!user_mode(regs) && !search_exception_tables(regs->pc)) { +- mmap_read_unlock(mm); +- goto no_context; +- } +-#endif ++ vma = lock_mm_and_find_vma(mm, addr, regs); ++ if (unlikely(!vma)) { ++ fault = VM_FAULT_BADMAP; ++ goto done; + } + +- fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs); ++ fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs); + + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { +@@ -628,6 +599,7 @@ retry: + } + mmap_read_unlock(mm); + ++done: + /* + * Handle the "normal" (no error) case first. 
+ */ +diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig +index adee6ab36862e..742009123fd5a 100644 +--- a/arch/csky/Kconfig ++++ b/arch/csky/Kconfig +@@ -96,6 +96,7 @@ config CSKY + select HAVE_RSEQ + select HAVE_STACKPROTECTOR + select HAVE_SYSCALL_TRACEPOINTS ++ select LOCK_MM_AND_FIND_VMA + select MAY_HAVE_SPARSE_IRQ + select MODULES_USE_ELF_RELA if MODULES + select OF +diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c +index e15f736cca4b4..a885518ce1dd2 100644 +--- a/arch/csky/mm/fault.c ++++ b/arch/csky/mm/fault.c +@@ -97,13 +97,12 @@ static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_f + BUG(); + } + +-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) ++static inline void bad_area_nosemaphore(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) + { + /* + * Something tried to access memory that isn't in our memory map. + * Fix it, but check if it's kernel or user first. + */ +- mmap_read_unlock(mm); + /* User mode accesses just cause a SIGSEGV */ + if (user_mode(regs)) { + do_trap(regs, SIGSEGV, code, addr); +@@ -238,20 +237,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs) + if (is_write(regs)) + flags |= FAULT_FLAG_WRITE; + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, addr); ++ vma = lock_mm_and_find_vma(mm, addr, regs); + if (unlikely(!vma)) { +- bad_area(regs, mm, code, addr); +- return; +- } +- if (likely(vma->vm_start <= addr)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- bad_area(regs, mm, code, addr); +- return; +- } +- if (unlikely(expand_stack(vma, addr))) { +- bad_area(regs, mm, code, addr); ++ bad_area_nosemaphore(regs, mm, code, addr); + return; + } + +@@ -259,11 +247,11 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it. + */ +-good_area: + code = SEGV_ACCERR; + + if (unlikely(access_error(regs, vma))) { +- bad_area(regs, mm, code, addr); ++ mmap_read_unlock(mm); ++ bad_area_nosemaphore(regs, mm, code, addr); + return; + } + +diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig +index 54eadf2651786..6726f4941015f 100644 +--- a/arch/hexagon/Kconfig ++++ b/arch/hexagon/Kconfig +@@ -28,6 +28,7 @@ config HEXAGON + select GENERIC_SMP_IDLE_THREAD + select STACKTRACE_SUPPORT + select GENERIC_CLOCKEVENTS_BROADCAST ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select GENERIC_CPU_DEVICES + select ARCH_WANT_LD_ORPHAN_WARN +diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c +index f73c7cbfe3260..583b087271667 100644 +--- a/arch/hexagon/mm/vm_fault.c ++++ b/arch/hexagon/mm/vm_fault.c +@@ -57,21 +57,10 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); +- if (!vma) +- goto bad_area; ++ vma = lock_mm_and_find_vma(mm, address, regs); ++ if (unlikely(!vma)) ++ goto bad_area_nosemaphore; + +- if (vma->vm_start <= address) +- goto good_area; +- +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- +- if (expand_stack(vma, address)) +- goto bad_area; +- +-good_area: + /* Address space is OK. Now check access rights. 
*/ + si_code = SEGV_ACCERR; + +@@ -140,6 +129,7 @@ good_area: + bad_area: + mmap_read_unlock(mm); + ++bad_area_nosemaphore: + if (user_mode(regs)) { + force_sig_fault(SIGSEGV, si_code, (void __user *)address); + return; +diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c +index ef78c2d66cdde..99a09abe1d2c5 100644 +--- a/arch/ia64/mm/fault.c ++++ b/arch/ia64/mm/fault.c +@@ -110,10 +110,12 @@ retry: + * register backing store that needs to expand upwards, in + * this case vma will be null, but prev_vma will ne non-null + */ +- if (( !vma && prev_vma ) || (address < vma->vm_start) ) +- goto check_expansion; ++ if (( !vma && prev_vma ) || (address < vma->vm_start) ) { ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; ++ } + +- good_area: + code = SEGV_ACCERR; + + /* OK, we've got a good vm_area for this memory area. Check the access permissions: */ +@@ -174,35 +176,9 @@ retry: + mmap_read_unlock(mm); + return; + +- check_expansion: +- if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) { +- if (!vma) +- goto bad_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) +- || REGION_OFFSET(address) >= RGN_MAP_LIMIT) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; +- } else { +- vma = prev_vma; +- if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) +- || REGION_OFFSET(address) >= RGN_MAP_LIMIT) +- goto bad_area; +- /* +- * Since the register backing store is accessed sequentially, +- * we disallow growing it by more than a page at a time. +- */ +- if (address > vma->vm_end + PAGE_SIZE - sizeof(long)) +- goto bad_area; +- if (expand_upwards(vma, address)) +- goto bad_area; +- } +- goto good_area; +- + bad_area: + mmap_read_unlock(mm); ++ bad_area_nosemaphore: + if ((isr & IA64_ISR_SP) + || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH)) + { +diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig +index 903096bd87f88..51d738ac12e55 100644 +--- a/arch/loongarch/Kconfig ++++ b/arch/loongarch/Kconfig +@@ -107,6 +107,7 @@ config LOONGARCH + select HAVE_VIRT_CPU_ACCOUNTING_GEN if !SMP + select IRQ_FORCED_THREADING + select IRQ_LOONGARCH_CPU ++ select LOCK_MM_AND_FIND_VMA + select MMU_GATHER_MERGE_VMAS if MMU + select MODULES_USE_ELF_RELA if MODULES + select NEED_PER_CPU_EMBED_FIRST_CHUNK +diff --git a/arch/loongarch/mm/fault.c b/arch/loongarch/mm/fault.c +index 1ccd53655cab0..b829ab911a17b 100644 +--- a/arch/loongarch/mm/fault.c ++++ b/arch/loongarch/mm/fault.c +@@ -166,22 +166,18 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); +- if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (!expand_stack(vma, address)) +- goto good_area; ++ vma = lock_mm_and_find_vma(mm, address, regs); ++ if (unlikely(!vma)) ++ goto bad_area_nosemaphore; ++ goto good_area; ++ + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. 
+ */ + bad_area: + mmap_read_unlock(mm); ++bad_area_nosemaphore: + do_sigsegv(regs, write, address, si_code); + return; + +diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c +index 4d2837eb3e2a3..6f62af8e293a0 100644 +--- a/arch/m68k/mm/fault.c ++++ b/arch/m68k/mm/fault.c +@@ -105,8 +105,9 @@ retry: + if (address + 256 < rdusp()) + goto map_err; + } +- if (expand_stack(vma, address)) +- goto map_err; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto map_err_nosemaphore; + + /* + * Ok, we have a good vm_area for this memory access, so +@@ -193,10 +194,12 @@ bus_err: + goto send_sig; + + map_err: ++ mmap_read_unlock(mm); ++map_err_nosemaphore: + current->thread.signo = SIGSEGV; + current->thread.code = SEGV_MAPERR; + current->thread.faddr = address; +- goto send_sig; ++ return send_fault_sig(regs); + + acc_err: + current->thread.signo = SIGSEGV; +diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c +index 5c40c3ebe52f7..a409bb3f09f7f 100644 +--- a/arch/microblaze/mm/fault.c ++++ b/arch/microblaze/mm/fault.c +@@ -192,8 +192,9 @@ retry: + && (kernel_mode(regs) || !store_updates_sp(regs))) + goto bad_area; + } +- if (expand_stack(vma, address)) +- goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; + + good_area: + code = SEGV_ACCERR; +diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig +index 2f5835e300a8f..7b0856c76c9ad 100644 +--- a/arch/mips/Kconfig ++++ b/arch/mips/Kconfig +@@ -94,6 +94,7 @@ config MIPS + select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP + select IRQ_FORCED_THREADING + select ISA if EISA ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_REL if MODULES + select MODULES_USE_ELF_RELA if MODULES && 64BIT + select PERF_USE_VMALLOC +diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c +index a27045f5a556d..d7878208bd3fa 100644 +--- a/arch/mips/mm/fault.c ++++ b/arch/mips/mm/fault.c +@@ -99,21 +99,13 @@ static void __do_page_fault(struct pt_regs *regs, unsigned long write, + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. 
+ */ +-good_area: + si_code = SEGV_ACCERR; + + if (write) { +diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig +index a582f72104f39..1fb78865a4593 100644 +--- a/arch/nios2/Kconfig ++++ b/arch/nios2/Kconfig +@@ -16,6 +16,7 @@ config NIOS2 + select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_KGDB + select IRQ_DOMAIN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select OF + select OF_EARLY_FLATTREE +diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c +index edaca0a6c1c1c..71939fb28c2e7 100644 +--- a/arch/nios2/mm/fault.c ++++ b/arch/nios2/mm/fault.c +@@ -86,27 +86,14 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause, + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + +- if (!mmap_read_trylock(mm)) { +- if (!user_mode(regs) && !search_exception_tables(regs->ea)) +- goto bad_area_nosemaphore; + retry: +- mmap_read_lock(mm); +- } +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +-good_area: + code = SEGV_ACCERR; + + switch (cause) { +diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c +index b4762d66e9efe..e3ad46d02fbdd 100644 +--- a/arch/openrisc/mm/fault.c ++++ b/arch/openrisc/mm/fault.c +@@ -127,8 +127,9 @@ retry: + if (address + PAGE_SIZE < regs->sp) + goto bad_area; + } +- if (expand_stack(vma, address)) +- goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; + + /* + * Ok, we have a good vm_area for this memory access, so +diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c +index 869204e97ec9d..b00aa98b582c2 100644 +--- a/arch/parisc/mm/fault.c ++++ b/arch/parisc/mm/fault.c +@@ -288,15 +288,19 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, + retry: + mmap_read_lock(mm); + vma = find_vma_prev(mm, address, &prev_vma); +- if (!vma || address < vma->vm_start) +- goto check_expansion; ++ if (!vma || address < vma->vm_start) { ++ if (!prev_vma || !(prev_vma->vm_flags & VM_GROWSUP)) ++ goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; ++ } ++ + /* + * Ok, we have a good vm_area for this memory access. We still need to + * check the access permissions. + */ + +-good_area: +- + if ((vma->vm_flags & acc_type) != acc_type) + goto bad_area; + +@@ -342,17 +346,13 @@ good_area: + mmap_read_unlock(mm); + return; + +-check_expansion: +- vma = prev_vma; +- if (vma && (expand_stack(vma, address) == 0)) +- goto good_area; +- + /* + * Something tried to access memory that isn't in our memory map.. 
+ */ + bad_area: + mmap_read_unlock(mm); + ++bad_area_nosemaphore: + if (user_mode(regs)) { + int signo, si_code; + +@@ -444,7 +444,7 @@ handle_nadtlb_fault(struct pt_regs *regs) + { + unsigned long insn = regs->iir; + int breg, treg, xreg, val = 0; +- struct vm_area_struct *vma, *prev_vma; ++ struct vm_area_struct *vma; + struct task_struct *tsk; + struct mm_struct *mm; + unsigned long address; +@@ -480,7 +480,7 @@ handle_nadtlb_fault(struct pt_regs *regs) + /* Search for VMA */ + address = regs->ior; + mmap_read_lock(mm); +- vma = find_vma_prev(mm, address, &prev_vma); ++ vma = vma_lookup(mm, address); + mmap_read_unlock(mm); + + /* +@@ -489,7 +489,6 @@ handle_nadtlb_fault(struct pt_regs *regs) + */ + acc_type = (insn & 0x40) ? VM_WRITE : VM_READ; + if (vma +- && address >= vma->vm_start + && (vma->vm_flags & acc_type) == acc_type) + val = 1; + } +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index 2b1141645d9e1..6050e6e10d321 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -257,6 +257,7 @@ config PPC + select IRQ_DOMAIN + select IRQ_FORCED_THREADING + select KASAN_VMALLOC if KASAN && MODULES ++ select LOCK_MM_AND_FIND_VMA + select MMU_GATHER_PAGE_SIZE + select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS +diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c +index 7c507fb48182b..f49fd873df8da 100644 +--- a/arch/powerpc/mm/copro_fault.c ++++ b/arch/powerpc/mm/copro_fault.c +@@ -33,19 +33,11 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, + if (mm->pgd == NULL) + return -EFAULT; + +- mmap_read_lock(mm); +- ret = -EFAULT; +- vma = find_vma(mm, ea); ++ vma = lock_mm_and_find_vma(mm, ea, NULL); + if (!vma) +- goto out_unlock; +- +- if (ea < vma->vm_start) { +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto out_unlock; +- if (expand_stack(vma, ea)) +- goto out_unlock; +- } ++ return -EFAULT; + ++ ret = -EFAULT; + is_write = dsisr & DSISR_ISSTORE; + if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) +diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c +index af46aa88422bf..644e4ec6ce99d 100644 +--- a/arch/powerpc/mm/fault.c ++++ b/arch/powerpc/mm/fault.c +@@ -84,11 +84,6 @@ static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code) + return __bad_area_nosemaphore(regs, address, si_code); + } + +-static noinline int bad_area(struct pt_regs *regs, unsigned long address) +-{ +- return __bad_area(regs, address, SEGV_MAPERR); +-} +- + static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address, + struct vm_area_struct *vma) + { +@@ -481,40 +476,12 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, + * we will deadlock attempting to validate the fault against the + * address space. Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the +- * exceptions table. +- * +- * As the vast majority of faults will be valid we will only perform +- * the source reference check when there is a possibility of a deadlock. +- * Attempt to lock the address space, if we cannot we then validate the +- * source. If this is invalid we can skip the address space check, +- * thus avoiding the deadlock. ++ * exceptions table. lock_mm_and_find_vma() handles that logic. 
+ */ +- if (unlikely(!mmap_read_trylock(mm))) { +- if (!is_user && !search_exception_tables(regs->nip)) +- return bad_area_nosemaphore(regs, address); +- + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above down_read_trylock() might have succeeded in +- * which case we'll have missed the might_sleep() from +- * down_read(): +- */ +- might_sleep(); +- } +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) +- return bad_area(regs, address); +- +- if (unlikely(vma->vm_start > address)) { +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) +- return bad_area(regs, address); +- +- if (unlikely(expand_stack(vma, address))) +- return bad_area(regs, address); +- } ++ return bad_area_nosemaphore(regs, address); + + if (unlikely(access_pkey_error(is_write, is_exec, + (error_code & DSISR_KEYFAULT), vma))) +diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig +index a85bbe28dcf46..6bf8dc0b8f935 100644 +--- a/arch/riscv/Kconfig ++++ b/arch/riscv/Kconfig +@@ -114,6 +114,7 @@ config RISCV + select HAVE_RSEQ + select IRQ_DOMAIN + select IRQ_FORCED_THREADING ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA if MODULES + select MODULE_SECTIONS if MODULES + select OF +diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c +index eb0774d9c03b1..274bc6dd839fa 100644 +--- a/arch/riscv/mm/fault.c ++++ b/arch/riscv/mm/fault.c +@@ -83,13 +83,13 @@ static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_f + BUG(); + } + +-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) ++static inline void ++bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr) + { + /* + * Something tried to access memory that isn't in our memory map. + * Fix it, but check if it's kernel or user first. + */ +- mmap_read_unlock(mm); + /* User mode accesses just cause a SIGSEGV */ + if (user_mode(regs)) { + do_trap(regs, SIGSEGV, code, addr); +@@ -99,6 +99,15 @@ static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code + no_context(regs, addr); + } + ++static inline void ++bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, ++ unsigned long addr) ++{ ++ mmap_read_unlock(mm); ++ ++ bad_area_nosemaphore(regs, code, addr); ++} ++ + static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr) + { + pgd_t *pgd, *pgd_k; +@@ -281,23 +290,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs) + else if (cause == EXC_INST_PAGE_FAULT) + flags |= FAULT_FLAG_INSTRUCTION; + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, addr); ++ vma = lock_mm_and_find_vma(mm, addr, regs); + if (unlikely(!vma)) { + tsk->thread.bad_cause = cause; +- bad_area(regs, mm, code, addr); +- return; +- } +- if (likely(vma->vm_start <= addr)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- tsk->thread.bad_cause = cause; +- bad_area(regs, mm, code, addr); +- return; +- } +- if (unlikely(expand_stack(vma, addr))) { +- tsk->thread.bad_cause = cause; +- bad_area(regs, mm, code, addr); ++ bad_area_nosemaphore(regs, code, addr); + return; + } + +@@ -305,7 +301,6 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it. 
+ */ +-good_area: + code = SEGV_ACCERR; + + if (unlikely(access_error(cause, vma))) { +diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c +index 8e84ed2bb944e..2ab388179833e 100644 +--- a/arch/s390/mm/fault.c ++++ b/arch/s390/mm/fault.c +@@ -429,8 +429,9 @@ retry: + if (unlikely(vma->vm_start > address)) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_up; +- if (expand_stack(vma, address)) +- goto out_up; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto out; + } + + /* +diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig +index 5f220e903e5ab..8e4d1f757bcc9 100644 +--- a/arch/sh/Kconfig ++++ b/arch/sh/Kconfig +@@ -56,6 +56,7 @@ config SUPERH + select HAVE_STACKPROTECTOR + select HAVE_SYSCALL_TRACEPOINTS + select IRQ_FORCED_THREADING ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select NEED_SG_DMA_LENGTH + select NO_DMA if !MMU && !DMA_COHERENT +diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c +index acd2f5e50bfcd..06e6b49529245 100644 +--- a/arch/sh/mm/fault.c ++++ b/arch/sh/mm/fault.c +@@ -439,21 +439,9 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, + } + + retry: +- mmap_read_lock(mm); +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) { +- bad_area(regs, error_code, address); +- return; +- } +- if (likely(vma->vm_start <= address)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- bad_area(regs, error_code, address); +- return; +- } +- if (unlikely(expand_stack(vma, address))) { +- bad_area(regs, error_code, address); ++ bad_area_nosemaphore(regs, error_code, address); + return; + } + +@@ -461,7 +449,6 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +-good_area: + if (unlikely(access_error(error_code, vma))) { + bad_area_access_error(regs, error_code, address); + return; +diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig +index 84437a4c65454..dbb1760cbe8c9 100644 +--- a/arch/sparc/Kconfig ++++ b/arch/sparc/Kconfig +@@ -56,6 +56,7 @@ config SPARC32 + select DMA_DIRECT_REMAP + select GENERIC_ATOMIC64 + select HAVE_UID16 ++ select LOCK_MM_AND_FIND_VMA + select OLD_SIGACTION + select ZONE_DMA + +diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c +index 91259f291c540..56eb40d385a82 100644 +--- a/arch/sparc/mm/fault_32.c ++++ b/arch/sparc/mm/fault_32.c +@@ -143,28 +143,19 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, + if (pagefault_disabled() || !mm) + goto no_context; + ++ if (!from_user && address >= PAGE_OFFSET) ++ goto no_context; ++ + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + retry: +- mmap_read_lock(mm); +- +- if (!from_user && address >= PAGE_OFFSET) +- goto bad_area; +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. 
+ */ +-good_area: + code = SEGV_ACCERR; + if (write) { + if (!(vma->vm_flags & VM_WRITE)) +@@ -318,17 +309,9 @@ static void force_user_fault(unsigned long address, int write) + + code = SEGV_MAPERR; + +- mmap_read_lock(mm); +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, NULL); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; +-good_area: ++ goto bad_area_nosemaphore; + code = SEGV_ACCERR; + if (write) { + if (!(vma->vm_flags & VM_WRITE)) +@@ -347,6 +330,7 @@ good_area: + return; + bad_area: + mmap_read_unlock(mm); ++bad_area_nosemaphore: + __do_fault_siginfo(code, SIGSEGV, tsk->thread.kregs, address); + return; + +diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c +index 4acc12eafbf54..df685a2418550 100644 +--- a/arch/sparc/mm/fault_64.c ++++ b/arch/sparc/mm/fault_64.c +@@ -383,8 +383,9 @@ continue_fault: + goto bad_area; + } + } +- if (expand_stack(vma, address)) +- goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. +@@ -482,8 +483,9 @@ exit_exception: + * Fix it, but check if it's kernel or user first.. + */ + bad_area: +- insn = get_fault_insn(regs, insn); + mmap_read_unlock(mm); ++bad_area_nosemaphore: ++ insn = get_fault_insn(regs, insn); + + handle_kernel_fault: + do_kernel_fault(regs, si_code, fault_code, insn, address); +diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c +index d3ce21c4ca32a..6d8ae86ae978f 100644 +--- a/arch/um/kernel/trap.c ++++ b/arch/um/kernel/trap.c +@@ -47,14 +47,15 @@ retry: + vma = find_vma(mm, address); + if (!vma) + goto out; +- else if (vma->vm_start <= address) ++ if (vma->vm_start <= address) + goto good_area; +- else if (!(vma->vm_flags & VM_GROWSDOWN)) ++ if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out; +- else if (is_user && !ARCH_IS_STACKGROW(address)) +- goto out; +- else if (expand_stack(vma, address)) ++ if (is_user && !ARCH_IS_STACKGROW(address)) + goto out; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto out_nosemaphore; + + good_area: + *code_out = SEGV_ACCERR; +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index b2c0fce3f257c..b3d5706579d43 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -271,6 +271,7 @@ config X86 + select HAVE_GENERIC_VDSO + select HOTPLUG_SMT if SMP + select IRQ_FORCED_THREADING ++ select LOCK_MM_AND_FIND_VMA + select NEED_PER_CPU_EMBED_FIRST_CHUNK + select NEED_PER_CPU_PAGE_FIRST_CHUNK + select NEED_SG_DMA_LENGTH +diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h +index b472ef76826ad..37639a2d9c34f 100644 +--- a/arch/x86/include/asm/cpu.h ++++ b/arch/x86/include/asm/cpu.h +@@ -96,4 +96,6 @@ static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1, + + extern u64 x86_read_arch_cap_msr(void); + ++extern struct cpumask cpus_stop_mask; ++ + #endif /* _ASM_X86_CPU_H */ +diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h +index a73bced40e241..b3b34032ef234 100644 +--- a/arch/x86/include/asm/smp.h ++++ b/arch/x86/include/asm/smp.h +@@ -132,6 +132,8 @@ void wbinvd_on_cpu(int cpu); + int wbinvd_on_all_cpus(void); + void cond_wakeup_cpu0(void); + ++void smp_kick_mwait_play_dead(void); ++ + void native_smp_send_reschedule(int cpu); + void native_send_call_func_ipi(const struct cpumask *mask); + void 
native_send_call_func_single_ipi(int cpu); +diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c +index 461e45d85add9..9a3092ec9b274 100644 +--- a/arch/x86/kernel/cpu/microcode/amd.c ++++ b/arch/x86/kernel/cpu/microcode/amd.c +@@ -705,7 +705,7 @@ static enum ucode_state apply_microcode_amd(int cpu) + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + /* need to apply patch? */ +- if (rev >= mc_amd->hdr.patch_id) { ++ if (rev > mc_amd->hdr.patch_id) { + ret = UCODE_OK; + goto out; + } +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index e436c9c1ef3b3..279b5e9be80fc 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -744,15 +744,26 @@ bool xen_set_default_idle(void) + } + #endif + ++struct cpumask cpus_stop_mask; ++ + void __noreturn stop_this_cpu(void *dummy) + { ++ struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info); ++ unsigned int cpu = smp_processor_id(); ++ + local_irq_disable(); ++ + /* +- * Remove this CPU: ++ * Remove this CPU from the online mask and disable it ++ * unconditionally. This might be redundant in case that the reboot ++ * vector was handled late and stop_other_cpus() sent an NMI. ++ * ++ * According to SDM and APM NMIs can be accepted even after soft ++ * disabling the local APIC. + */ +- set_cpu_online(smp_processor_id(), false); ++ set_cpu_online(cpu, false); + disable_local_APIC(); +- mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); ++ mcheck_cpu_clear(c); + + /* + * Use wbinvd on processors that support SME. This provides support +@@ -766,8 +777,17 @@ void __noreturn stop_this_cpu(void *dummy) + * Test the CPUID bit directly because the machine might've cleared + * X86_FEATURE_SME due to cmdline options. + */ +- if (cpuid_eax(0x8000001f) & BIT(0)) ++ if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) + native_wbinvd(); ++ ++ /* ++ * This brings a cache line back and dirties it, but ++ * native_stop_other_cpus() will overwrite cpus_stop_mask after it ++ * observed that all CPUs reported stop. This write will invalidate ++ * the related cache line on this CPU. ++ */ ++ cpumask_clear_cpu(cpu, &cpus_stop_mask); ++ + for (;;) { + /* + * Use native_halt() so that memory contents don't change +diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c +index 375b33ecafa27..174d6232b87fd 100644 +--- a/arch/x86/kernel/smp.c ++++ b/arch/x86/kernel/smp.c +@@ -21,12 +21,14 @@ + #include + #include + #include ++#include + + #include + #include + #include + #include + #include ++#include + #include + #include + #include +@@ -146,34 +148,47 @@ static int register_stop_handler(void) + + static void native_stop_other_cpus(int wait) + { +- unsigned long flags; +- unsigned long timeout; ++ unsigned int cpu = smp_processor_id(); ++ unsigned long flags, timeout; + + if (reboot_force) + return; + +- /* +- * Use an own vector here because smp_call_function +- * does lots of things not suitable in a panic situation. +- */ ++ /* Only proceed if this is the first CPU to reach this code */ ++ if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1) ++ return; ++ ++ /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */ ++ if (kexec_in_progress) ++ smp_kick_mwait_play_dead(); + + /* +- * We start by using the REBOOT_VECTOR irq. +- * The irq is treated as a sync point to allow critical +- * regions of code on other cpus to release their spin locks +- * and re-enable irqs. Jumping straight to an NMI might +- * accidentally cause deadlocks with further shutdown/panic +- * code. 
By syncing, we give the cpus up to one second to +- * finish their work before we force them off with the NMI. ++ * 1) Send an IPI on the reboot vector to all other CPUs. ++ * ++ * The other CPUs should react on it after leaving critical ++ * sections and re-enabling interrupts. They might still hold ++ * locks, but there is nothing which can be done about that. ++ * ++ * 2) Wait for all other CPUs to report that they reached the ++ * HLT loop in stop_this_cpu() ++ * ++ * 3) If #2 timed out send an NMI to the CPUs which did not ++ * yet report ++ * ++ * 4) Wait for all other CPUs to report that they reached the ++ * HLT loop in stop_this_cpu() ++ * ++ * #3 can obviously race against a CPU reaching the HLT loop late. ++ * That CPU will have reported already and the "have all CPUs ++ * reached HLT" condition will be true despite the fact that the ++ * other CPU is still handling the NMI. Again, there is no ++ * protection against that as "disabled" APICs still respond to ++ * NMIs. + */ +- if (num_online_cpus() > 1) { +- /* did someone beat us here? */ +- if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) +- return; +- +- /* sync above data before sending IRQ */ +- wmb(); ++ cpumask_copy(&cpus_stop_mask, cpu_online_mask); ++ cpumask_clear_cpu(cpu, &cpus_stop_mask); + ++ if (!cpumask_empty(&cpus_stop_mask)) { + apic_send_IPI_allbutself(REBOOT_VECTOR); + + /* +@@ -183,24 +198,22 @@ static void native_stop_other_cpus(int wait) + * CPUs reach shutdown state. + */ + timeout = USEC_PER_SEC; +- while (num_online_cpus() > 1 && timeout--) ++ while (!cpumask_empty(&cpus_stop_mask) && timeout--) + udelay(1); + } + + /* if the REBOOT_VECTOR didn't work, try with the NMI */ +- if (num_online_cpus() > 1) { ++ if (!cpumask_empty(&cpus_stop_mask)) { + /* + * If NMI IPI is enabled, try to register the stop handler + * and send the IPI. In any case try to wait for the other + * CPUs to stop. + */ + if (!smp_no_nmi_ipi && !register_stop_handler()) { +- /* Sync above data before sending IRQ */ +- wmb(); +- + pr_emerg("Shutting down cpus with NMI\n"); + +- apic_send_IPI_allbutself(NMI_VECTOR); ++ for_each_cpu(cpu, &cpus_stop_mask) ++ apic->send_IPI(cpu, NMI_VECTOR); + } + /* + * Don't wait longer than 10 ms if the caller didn't +@@ -208,7 +221,7 @@ static void native_stop_other_cpus(int wait) + * one or more CPUs do not reach shutdown state. + */ + timeout = USEC_PER_MSEC * 10; +- while (num_online_cpus() > 1 && (wait || timeout--)) ++ while (!cpumask_empty(&cpus_stop_mask) && (wait || timeout--)) + udelay(1); + } + +@@ -216,6 +229,12 @@ static void native_stop_other_cpus(int wait) + disable_local_APIC(); + mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); + local_irq_restore(flags); ++ ++ /* ++ * Ensure that the cpus_stop_mask cache lines are invalidated on ++ * the other CPUs. See comment vs. SME in stop_this_cpu(). 
++ */ ++ cpumask_clear(&cpus_stop_mask); + } + + /* +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index 3f3ea0287f694..f32ee967414e6 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -53,6 +53,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -99,6 +100,20 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map); + DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); + EXPORT_PER_CPU_SYMBOL(cpu_info); + ++struct mwait_cpu_dead { ++ unsigned int control; ++ unsigned int status; ++}; ++ ++#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF ++#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD ++ ++/* ++ * Cache line aligned data for mwait_play_dead(). Separate on purpose so ++ * that it's unlikely to be touched by other CPUs. ++ */ ++static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead); ++ + /* Logical package management. We might want to allocate that dynamically */ + unsigned int __max_logical_packages __read_mostly; + EXPORT_SYMBOL(__max_logical_packages); +@@ -155,6 +170,10 @@ static void smp_callin(void) + { + int cpuid; + ++ /* Mop up eventual mwait_play_dead() wreckage */ ++ this_cpu_write(mwait_cpu_dead.status, 0); ++ this_cpu_write(mwait_cpu_dead.control, 0); ++ + /* + * If waken up by an INIT in an 82489DX configuration + * cpu_callout_mask guarantees we don't get here before +@@ -1746,10 +1765,10 @@ EXPORT_SYMBOL_GPL(cond_wakeup_cpu0); + */ + static inline void mwait_play_dead(void) + { ++ struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); + unsigned int eax, ebx, ecx, edx; + unsigned int highest_cstate = 0; + unsigned int highest_subcstate = 0; +- void *mwait_ptr; + int i; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || +@@ -1784,12 +1803,9 @@ static inline void mwait_play_dead(void) + (highest_subcstate - 1); + } + +- /* +- * This should be a memory location in a cache line which is +- * unlikely to be touched by other processors. The actual +- * content is immaterial as it is not actually modified in any way. +- */ +- mwait_ptr = ¤t_thread_info()->flags; ++ /* Set up state for the kexec() hack below */ ++ md->status = CPUDEAD_MWAIT_WAIT; ++ md->control = CPUDEAD_MWAIT_WAIT; + + wbinvd(); + +@@ -1802,16 +1818,63 @@ static inline void mwait_play_dead(void) + * case where we return around the loop. + */ + mb(); +- clflush(mwait_ptr); ++ clflush(md); + mb(); +- __monitor(mwait_ptr, 0, 0); ++ __monitor(md, 0, 0); + mb(); + __mwait(eax, 0); + ++ if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) { ++ /* ++ * Kexec is about to happen. Don't go back into mwait() as ++ * the kexec kernel might overwrite text and data including ++ * page tables and stack. So mwait() would resume when the ++ * monitor cache line is written to and then the CPU goes ++ * south due to overwritten text, page tables and stack. ++ * ++ * Note: This does _NOT_ protect against a stray MCE, NMI, ++ * SMI. They will resume execution at the instruction ++ * following the HLT instruction and run into the problem ++ * which this is trying to prevent. ++ */ ++ WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT); ++ while(1) ++ native_halt(); ++ } ++ + cond_wakeup_cpu0(); + } + } + ++/* ++ * Kick all "offline" CPUs out of mwait on kexec(). See comment in ++ * mwait_play_dead(). 
++ */ ++void smp_kick_mwait_play_dead(void) ++{ ++ u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT; ++ struct mwait_cpu_dead *md; ++ unsigned int cpu, i; ++ ++ for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) { ++ md = per_cpu_ptr(&mwait_cpu_dead, cpu); ++ ++ /* Does it sit in mwait_play_dead() ? */ ++ if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT) ++ continue; ++ ++ /* Wait up to 5ms */ ++ for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) { ++ /* Bring it out of mwait */ ++ WRITE_ONCE(md->control, newstate); ++ udelay(5); ++ } ++ ++ if (READ_ONCE(md->status) != newstate) ++ pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu); ++ } ++} ++ + void hlt_play_dead(void) + { + if (__this_cpu_read(cpu_info.x86) >= 4) +diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index 7b0d4ab894c8b..1dbbad73192a1 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -900,12 +900,6 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, + __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); + } + +-static noinline void +-bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) +-{ +- __bad_area(regs, error_code, address, 0, SEGV_MAPERR); +-} +- + static inline bool bad_area_access_from_pkeys(unsigned long error_code, + struct vm_area_struct *vma) + { +@@ -1354,51 +1348,10 @@ void do_user_addr_fault(struct pt_regs *regs, + } + #endif + +- /* +- * Kernel-mode access to the user address space should only occur +- * on well-defined single instructions listed in the exception +- * tables. But, an erroneous kernel fault occurring outside one of +- * those areas which also holds mmap_lock might deadlock attempting +- * to validate the fault against the address space. +- * +- * Only do the expensive exception table search when we might be at +- * risk of a deadlock. This happens if we +- * 1. Failed to acquire mmap_lock, and +- * 2. The access did not originate in userspace. +- */ +- if (unlikely(!mmap_read_trylock(mm))) { +- if (!user_mode(regs) && !search_exception_tables(regs->ip)) { +- /* +- * Fault from code in kernel from +- * which we do not expect faults. +- */ +- bad_area_nosemaphore(regs, error_code, address); +- return; +- } + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above down_read_trylock() might have succeeded in +- * which case we'll have missed the might_sleep() from +- * down_read(): +- */ +- might_sleep(); +- } +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) { +- bad_area(regs, error_code, address); +- return; +- } +- if (likely(vma->vm_start <= address)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- bad_area(regs, error_code, address); +- return; +- } +- if (unlikely(expand_stack(vma, address))) { +- bad_area(regs, error_code, address); ++ bad_area_nosemaphore(regs, error_code, address); + return; + } + +@@ -1406,7 +1359,6 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. 
+ */ +-good_area: + if (unlikely(access_error(error_code, vma))) { + bad_area_access_error(regs, error_code, address, vma); + return; +diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig +index bcb0c5d2abc2f..6d3c9257aa133 100644 +--- a/arch/xtensa/Kconfig ++++ b/arch/xtensa/Kconfig +@@ -49,6 +49,7 @@ config XTENSA + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_VIRT_CPU_ACCOUNTING_GEN + select IRQ_DOMAIN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select PERF_USE_VMALLOC + select TRACE_IRQFLAGS_SUPPORT +diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c +index 8c781b05c0bdd..d89b193c779f1 100644 +--- a/arch/xtensa/mm/fault.c ++++ b/arch/xtensa/mm/fault.c +@@ -130,23 +130,14 @@ void do_page_fault(struct pt_regs *regs) + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); +- ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + + /* Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ + +-good_area: + code = SEGV_ACCERR; + + if (is_write) { +@@ -205,6 +196,7 @@ good_area: + */ + bad_area: + mmap_read_unlock(mm); ++bad_area_nosemaphore: + if (user_mode(regs)) { + current->thread.bad_vaddr = address; + current->thread.error_code = is_write; +diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c +index b2cd7527de195..0b4204b9a253c 100644 +--- a/drivers/hid/hid-logitech-hidpp.c ++++ b/drivers/hid/hid-logitech-hidpp.c +@@ -4348,7 +4348,7 @@ static const struct hid_device_id hidpp_devices[] = { + { /* wireless touchpad T651 */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, + USB_DEVICE_ID_LOGITECH_T651), +- .driver_data = HIDPP_QUIRK_CLASS_WTP }, ++ .driver_data = HIDPP_QUIRK_CLASS_WTP | HIDPP_QUIRK_DELAYED_INIT }, + { /* Mouse Logitech Anywhere MX */ + LDJ_DEVICE(0x1017), .driver_data = HIDPP_QUIRK_HI_RES_SCROLL_1P0 }, + { /* Mouse logitech M560 */ +diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c +index 197b1e7bf029e..b617aada50b06 100644 +--- a/drivers/hid/hidraw.c ++++ b/drivers/hid/hidraw.c +@@ -272,7 +272,12 @@ static int hidraw_open(struct inode *inode, struct file *file) + goto out; + } + +- down_read(&minors_rwsem); ++ /* ++ * Technically not writing to the hidraw_table but a write lock is ++ * required to protect the device refcount. This is symmetrical to ++ * hidraw_release(). 
++ */ ++ down_write(&minors_rwsem); + if (!hidraw_table[minor] || !hidraw_table[minor]->exist) { + err = -ENODEV; + goto out_unlock; +@@ -301,7 +306,7 @@ static int hidraw_open(struct inode *inode, struct file *file) + spin_unlock_irqrestore(&hidraw_table[minor]->list_lock, flags); + file->private_data = list; + out_unlock: +- up_read(&minors_rwsem); ++ up_write(&minors_rwsem); + out: + if (err < 0) + kfree(list); +diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c +index 9c30dd30537af..15cd0cabee2a9 100644 +--- a/drivers/hid/wacom_wac.c ++++ b/drivers/hid/wacom_wac.c +@@ -1309,7 +1309,7 @@ static void wacom_intuos_pro2_bt_pen(struct wacom_wac *wacom) + struct input_dev *pen_input = wacom->pen_input; + unsigned char *data = wacom->data; + int number_of_valid_frames = 0; +- int time_interval = 15000000; ++ ktime_t time_interval = 15000000; + ktime_t time_packet_received = ktime_get(); + int i; + +@@ -1343,7 +1343,7 @@ static void wacom_intuos_pro2_bt_pen(struct wacom_wac *wacom) + if (number_of_valid_frames) { + if (wacom->hid_data.time_delayed) + time_interval = ktime_get() - wacom->hid_data.time_delayed; +- time_interval /= number_of_valid_frames; ++ time_interval = div_u64(time_interval, number_of_valid_frames); + wacom->hid_data.time_delayed = time_packet_received; + } + +@@ -1354,7 +1354,7 @@ static void wacom_intuos_pro2_bt_pen(struct wacom_wac *wacom) + bool range = frame[0] & 0x20; + bool invert = frame[0] & 0x10; + int frames_number_reversed = number_of_valid_frames - i - 1; +- int event_timestamp = time_packet_received - frames_number_reversed * time_interval; ++ ktime_t event_timestamp = time_packet_received - frames_number_reversed * time_interval; + + if (!valid) + continue; +diff --git a/drivers/hid/wacom_wac.h b/drivers/hid/wacom_wac.h +index 1a40bb8c5810c..ee21bb260f22f 100644 +--- a/drivers/hid/wacom_wac.h ++++ b/drivers/hid/wacom_wac.h +@@ -324,7 +324,7 @@ struct hid_data { + int ps_connected; + bool pad_input_event_flag; + unsigned short sequence_number; +- int time_delayed; ++ ktime_t time_delayed; + }; + + struct wacom_remote_data { +diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c +index 9f7fab49a5a90..75355ddca6575 100644 +--- a/drivers/iommu/amd/iommu_v2.c ++++ b/drivers/iommu/amd/iommu_v2.c +@@ -485,8 +485,8 @@ static void do_fault(struct work_struct *work) + flags |= FAULT_FLAG_REMOTE; + + mmap_read_lock(mm); +- vma = find_extend_vma(mm, address); +- if (!vma || address < vma->vm_start) ++ vma = vma_lookup(mm, address); ++ if (!vma) + /* failed to get a vma in the right range */ + goto out; + +diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c +index 1df8c1dcae776..16e564cb393b7 100644 +--- a/drivers/iommu/io-pgfault.c ++++ b/drivers/iommu/io-pgfault.c +@@ -89,7 +89,7 @@ iopf_handle_single(struct iopf_fault *iopf) + + mmap_read_lock(mm); + +- vma = find_extend_vma(mm, prm->addr); ++ vma = vma_lookup(mm, prm->addr); + if (!vma) + /* Unmapped area */ + goto out_put_mm; +diff --git a/drivers/thermal/mtk_thermal.c b/drivers/thermal/mtk_thermal.c +index 62f1e691659e3..8440692e3890d 100644 +--- a/drivers/thermal/mtk_thermal.c ++++ b/drivers/thermal/mtk_thermal.c +@@ -1028,12 +1028,7 @@ static int mtk_thermal_probe(struct platform_device *pdev) + return -ENODEV; + } + +- auxadc_base = devm_of_iomap(&pdev->dev, auxadc, 0, NULL); +- if (IS_ERR(auxadc_base)) { +- of_node_put(auxadc); +- return PTR_ERR(auxadc_base); +- } +- ++ auxadc_base = of_iomap(auxadc, 0); + auxadc_phys_base = of_get_phys_base(auxadc); + + 
of_node_put(auxadc); +@@ -1049,12 +1044,7 @@ static int mtk_thermal_probe(struct platform_device *pdev) + return -ENODEV; + } + +- apmixed_base = devm_of_iomap(&pdev->dev, apmixedsys, 0, NULL); +- if (IS_ERR(apmixed_base)) { +- of_node_put(apmixedsys); +- return PTR_ERR(apmixed_base); +- } +- ++ apmixed_base = of_iomap(apmixedsys, 0); + apmixed_phys_base = of_get_phys_base(apmixedsys); + + of_node_put(apmixedsys); +diff --git a/drivers/video/fbdev/core/sysimgblt.c b/drivers/video/fbdev/core/sysimgblt.c +index 335e92b813fc4..665ef7a0a2495 100644 +--- a/drivers/video/fbdev/core/sysimgblt.c ++++ b/drivers/video/fbdev/core/sysimgblt.c +@@ -189,7 +189,7 @@ static void fast_imageblit(const struct fb_image *image, struct fb_info *p, + u32 fgx = fgcolor, bgx = bgcolor, bpp = p->var.bits_per_pixel; + u32 ppw = 32/bpp, spitch = (image->width + 7)/8; + u32 bit_mask, eorx, shift; +- const char *s = image->data, *src; ++ const u8 *s = image->data, *src; + u32 *dst; + const u32 *tab; + size_t tablen; +diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c +index 444302afc673a..e6c9c0e084486 100644 +--- a/fs/binfmt_elf.c ++++ b/fs/binfmt_elf.c +@@ -315,10 +315,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, + * Grow the stack manually; some architectures have a limit on how + * far ahead a user-space access may be in order to grow the stack. + */ +- if (mmap_read_lock_killable(mm)) ++ if (mmap_write_lock_killable(mm)) + return -EINTR; +- vma = find_extend_vma(mm, bprm->p); +- mmap_read_unlock(mm); ++ vma = find_extend_vma_locked(mm, bprm->p); ++ mmap_write_unlock(mm); + if (!vma) + return -EFAULT; + +diff --git a/fs/exec.c b/fs/exec.c +index a0b1f0337a628..283012eb1aeb9 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -198,33 +198,39 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, + int write) + { + struct page *page; ++ struct vm_area_struct *vma = bprm->vma; ++ struct mm_struct *mm = bprm->mm; + int ret; +- unsigned int gup_flags = FOLL_FORCE; + +-#ifdef CONFIG_STACK_GROWSUP +- if (write) { +- ret = expand_downwards(bprm->vma, pos); +- if (ret < 0) ++ /* ++ * Avoid relying on expanding the stack down in GUP (which ++ * does not work for STACK_GROWSUP anyway), and just do it ++ * by hand ahead of time. ++ */ ++ if (write && pos < vma->vm_start) { ++ mmap_write_lock(mm); ++ ret = expand_downwards(vma, pos); ++ if (unlikely(ret < 0)) { ++ mmap_write_unlock(mm); + return NULL; +- } +-#endif +- +- if (write) +- gup_flags |= FOLL_WRITE; ++ } ++ mmap_write_downgrade(mm); ++ } else ++ mmap_read_lock(mm); + + /* + * We are doing an exec(). 'current' is the process +- * doing the exec and bprm->mm is the new process's mm. ++ * doing the exec and 'mm' is the new process's mm. + */ +- mmap_read_lock(bprm->mm); +- ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, ++ ret = get_user_pages_remote(mm, pos, 1, ++ write ? 
FOLL_WRITE : 0, + &page, NULL, NULL); +- mmap_read_unlock(bprm->mm); ++ mmap_read_unlock(mm); + if (ret <= 0) + return NULL; + + if (write) +- acct_arg_size(bprm, vma_pages(bprm->vma)); ++ acct_arg_size(bprm, vma_pages(vma)); + + return page; + } +@@ -854,7 +860,7 @@ int setup_arg_pages(struct linux_binprm *bprm, + stack_base = vma->vm_start - stack_expand; + #endif + current->mm->start_stack = bprm->p; +- ret = expand_stack(vma, stack_base); ++ ret = expand_stack_locked(vma, stack_base); + if (ret) + ret = -EFAULT; + +diff --git a/include/linux/highmem.h b/include/linux/highmem.h +index e9912da5441b4..44242268f53bd 100644 +--- a/include/linux/highmem.h ++++ b/include/linux/highmem.h +@@ -319,6 +319,32 @@ static inline void copy_user_highpage(struct page *to, struct page *from, + + #endif + ++#ifdef copy_mc_to_kernel ++static inline int copy_mc_user_highpage(struct page *to, struct page *from, ++ unsigned long vaddr, struct vm_area_struct *vma) ++{ ++ unsigned long ret; ++ char *vfrom, *vto; ++ ++ vfrom = kmap_local_page(from); ++ vto = kmap_local_page(to); ++ ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); ++ if (!ret) ++ kmsan_unpoison_memory(page_address(to), PAGE_SIZE); ++ kunmap_local(vto); ++ kunmap_local(vfrom); ++ ++ return ret; ++} ++#else ++static inline int copy_mc_user_highpage(struct page *to, struct page *from, ++ unsigned long vaddr, struct vm_area_struct *vma) ++{ ++ copy_user_highpage(to, from, vaddr, vma); ++ return 0; ++} ++#endif ++ + #ifndef __HAVE_ARCH_COPY_HIGHPAGE + + static inline void copy_highpage(struct page *to, struct page *from) +diff --git a/include/linux/mm.h b/include/linux/mm.h +index e5e8acf8eb895..f5b6ef943ede2 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1921,6 +1921,9 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); + void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); + int generic_error_remove_page(struct address_space *mapping, struct page *page); + ++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, ++ unsigned long address, struct pt_regs *regs); ++ + #ifdef CONFIG_MMU + extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, +@@ -2808,16 +2811,11 @@ extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); + + extern unsigned long stack_guard_gap; + /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ +-extern int expand_stack(struct vm_area_struct *vma, unsigned long address); ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); ++struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); + + /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ +-extern int expand_downwards(struct vm_area_struct *vma, +- unsigned long address); +-#if VM_GROWSUP +-extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); +-#else +- #define expand_upwards(vma, address) (0) +-#endif ++int expand_downwards(struct vm_area_struct *vma, unsigned long address); + + /* Look up the first VMA which satisfies addr < vm_end, NULL if none. 
*/ + extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); +@@ -2912,7 +2910,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long start, unsigned long end); + #endif + +-struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, ++ unsigned long addr); + int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); + int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, +@@ -3295,7 +3294,6 @@ enum mf_flags { + int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, + unsigned long count, int mf_flags); + extern int memory_failure(unsigned long pfn, int flags); +-extern void memory_failure_queue(unsigned long pfn, int flags); + extern void memory_failure_queue_kick(int cpu); + extern int unpoison_memory(unsigned long pfn); + extern int sysctl_memory_failure_early_kill; +@@ -3304,8 +3302,12 @@ extern void shake_page(struct page *p); + extern atomic_long_t num_poisoned_pages __read_mostly; + extern int soft_offline_page(unsigned long pfn, int flags); + #ifdef CONFIG_MEMORY_FAILURE ++extern void memory_failure_queue(unsigned long pfn, int flags); + extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags); + #else ++static inline void memory_failure_queue(unsigned long pfn, int flags) ++{ ++} + static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) + { + return 0; +diff --git a/lib/maple_tree.c b/lib/maple_tree.c +index b72268848ade1..2f1f661157c92 100644 +--- a/lib/maple_tree.c ++++ b/lib/maple_tree.c +@@ -4281,11 +4281,13 @@ done: + + static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) + { +- while ((wr_mas->mas->last > wr_mas->end_piv) && +- (wr_mas->offset_end < wr_mas->node_end)) +- wr_mas->end_piv = wr_mas->pivots[++wr_mas->offset_end]; ++ while ((wr_mas->offset_end < wr_mas->node_end) && ++ (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) ++ wr_mas->offset_end++; + +- if (wr_mas->mas->last > wr_mas->end_piv) ++ if (wr_mas->offset_end < wr_mas->node_end) ++ wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; ++ else + wr_mas->end_piv = wr_mas->mas->max; + } + +@@ -4442,7 +4444,6 @@ static inline void *mas_wr_store_entry(struct ma_wr_state *wr_mas) + } + + /* At this point, we are at the leaf node that needs to be altered. */ +- wr_mas->end_piv = wr_mas->r_max; + mas_wr_end_piv(wr_mas); + + if (!wr_mas->entry) +diff --git a/mm/Kconfig b/mm/Kconfig +index 57e1d8c5b5052..35109a4a2f7ce 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -1150,6 +1150,10 @@ config LRU_GEN_STATS + This option has a per-memcg and per-node memory overhead. 
+ # } + ++config LOCK_MM_AND_FIND_VMA ++ bool ++ depends on !STACK_GROWSUP ++ + source "mm/damon/Kconfig" + + endmenu +diff --git a/mm/gup.c b/mm/gup.c +index 028f3b4e8c3f2..f4911ddd30707 100644 +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -1182,7 +1182,7 @@ static long __get_user_pages(struct mm_struct *mm, + + /* first iteration or cross vma bound */ + if (!vma || start >= vma->vm_end) { +- vma = find_extend_vma(mm, start); ++ vma = vma_lookup(mm, start); + if (!vma && in_gate_area(mm, start)) { + ret = get_gate_page(mm, start & PAGE_MASK, + gup_flags, &vma, +@@ -1351,8 +1351,8 @@ int fixup_user_fault(struct mm_struct *mm, + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + retry: +- vma = find_extend_vma(mm, address); +- if (!vma || address < vma->vm_start) ++ vma = vma_lookup(mm, address); ++ if (!vma) + return -EFAULT; + + if (!vma_permits_fault(vma, fault_flags)) +diff --git a/mm/memory.c b/mm/memory.c +index 747b7ea30f890..77549434d13a7 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -2843,10 +2843,16 @@ static inline int pte_unmap_same(struct vm_fault *vmf) + return same; + } + +-static inline bool __wp_page_copy_user(struct page *dst, struct page *src, +- struct vm_fault *vmf) ++/* ++ * Return: ++ * 0: copy succeeded ++ * -EHWPOISON: copy failed due to hwpoison in source page ++ * -EAGAIN: copy failed (some other reason) ++ */ ++static inline int __wp_page_copy_user(struct page *dst, struct page *src, ++ struct vm_fault *vmf) + { +- bool ret; ++ int ret; + void *kaddr; + void __user *uaddr; + bool locked = false; +@@ -2855,8 +2861,11 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, + unsigned long addr = vmf->address; + + if (likely(src)) { +- copy_user_highpage(dst, src, addr, vma); +- return true; ++ if (copy_mc_user_highpage(dst, src, addr, vma)) { ++ memory_failure_queue(page_to_pfn(src), 0); ++ return -EHWPOISON; ++ } ++ return 0; + } + + /* +@@ -2883,7 +2892,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, + * and update local tlb only + */ + update_mmu_tlb(vma, addr, vmf->pte); +- ret = false; ++ ret = -EAGAIN; + goto pte_unlock; + } + +@@ -2908,7 +2917,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, + if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { + /* The PTE changed under us, update local tlb */ + update_mmu_tlb(vma, addr, vmf->pte); +- ret = false; ++ ret = -EAGAIN; + goto pte_unlock; + } + +@@ -2927,7 +2936,7 @@ warn: + } + } + +- ret = true; ++ ret = 0; + + pte_unlock: + if (locked) +@@ -3099,6 +3108,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) + pte_t entry; + int page_copied = 0; + struct mmu_notifier_range range; ++ int ret; + + delayacct_wpcopy_start(); + +@@ -3116,19 +3126,21 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) + if (!new_page) + goto oom; + +- if (!__wp_page_copy_user(new_page, old_page, vmf)) { ++ ret = __wp_page_copy_user(new_page, old_page, vmf); ++ if (ret) { + /* + * COW failed, if the fault was solved by other, + * it's fine. If not, userspace would re-fault on + * the same address and we will handle the fault + * from the second attempt. ++ * The -EHWPOISON case will not be retried. + */ + put_page(new_page); + if (old_page) + put_page(old_page); + + delayacct_wpcopy_end(); +- return 0; ++ return ret == -EHWPOISON ?
VM_FAULT_HWPOISON : 0; + } + kmsan_copy_page_meta(new_page, old_page); + } +@@ -5246,6 +5258,125 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + } + EXPORT_SYMBOL_GPL(handle_mm_fault); + ++#ifdef CONFIG_LOCK_MM_AND_FIND_VMA ++#include <linux/extable.h> ++ ++static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) ++{ ++ /* Even if this succeeds, make it clear we *might* have slept */ ++ if (likely(mmap_read_trylock(mm))) { ++ might_sleep(); ++ return true; ++ } ++ ++ if (regs && !user_mode(regs)) { ++ unsigned long ip = instruction_pointer(regs); ++ if (!search_exception_tables(ip)) ++ return false; ++ } ++ ++ return !mmap_read_lock_killable(mm); ++} ++ ++static inline bool mmap_upgrade_trylock(struct mm_struct *mm) ++{ ++ /* ++ * We don't have this operation yet. ++ * ++ * It should be easy enough to do: it's basically an ++ * atomic_long_try_cmpxchg_acquire() ++ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but ++ * it also needs the proper lockdep magic etc. ++ */ ++ return false; ++} ++ ++static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) ++{ ++ mmap_read_unlock(mm); ++ if (regs && !user_mode(regs)) { ++ unsigned long ip = instruction_pointer(regs); ++ if (!search_exception_tables(ip)) ++ return false; ++ } ++ return !mmap_write_lock_killable(mm); ++} ++ ++/* ++ * Helper for page fault handling. ++ * ++ * This is kind of equivalent to "mmap_read_lock()" followed ++ * by "find_extend_vma()", except it's a lot more careful about ++ * the locking (and will drop the lock on failure). ++ * ++ * For example, if we have a kernel bug that causes a page ++ * fault, we don't want to just use mmap_read_lock() to get ++ * the mm lock, because that would deadlock if the bug were ++ * to happen while we're holding the mm lock for writing. ++ * ++ * So this checks the exception tables on kernel faults in ++ * order to only do this all for instructions that are actually ++ * expected to fault. ++ * ++ * We can also actually take the mm lock for writing if we ++ * need to extend the vma, which helps the VM layer a lot. ++ */ ++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, ++ unsigned long addr, struct pt_regs *regs) ++{ ++ struct vm_area_struct *vma; ++ ++ if (!get_mmap_lock_carefully(mm, regs)) ++ return NULL; ++ ++ vma = find_vma(mm, addr); ++ if (likely(vma && (vma->vm_start <= addr))) ++ return vma; ++ ++ /* ++ * Well, dang. We might still be successful, but only ++ * if we can extend a vma to do so. ++ */ ++ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { ++ mmap_read_unlock(mm); ++ return NULL; ++ } ++ ++ /* ++ * We can try to upgrade the mmap lock atomically, ++ * in which case we can continue to use the vma ++ * we already looked up. ++ * ++ * Otherwise we'll have to drop the mmap lock and ++ * re-take it, and also look up the vma again, ++ * re-checking it. ++ */ ++ if (!mmap_upgrade_trylock(mm)) { ++ if (!upgrade_mmap_lock_carefully(mm, regs)) ++ return NULL; ++ ++ vma = find_vma(mm, addr); ++ if (!vma) ++ goto fail; ++ if (vma->vm_start <= addr) ++ goto success; ++ if (!(vma->vm_flags & VM_GROWSDOWN)) ++ goto fail; ++ } ++ ++ if (expand_stack_locked(vma, addr)) ++ goto fail; ++ ++success: ++ mmap_write_downgrade(mm); ++ return vma; ++ ++fail: ++ mmap_write_unlock(mm); ++ return NULL; ++} ++#endif ++ + #ifndef __PAGETABLE_P4D_FOLDED + /* + * Allocate p4d page table.
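/*
 * A minimal sketch (not part of the patch above) of how an architecture's
 * fault handler is expected to call the new helper. The function name
 * do_page_fault_sketch and the bare-bones error paths are illustrative
 * assumptions; real handlers also deal with retry, OOM and signal delivery.
 */
static void do_page_fault_sketch(struct pt_regs *regs, unsigned long address)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	vm_fault_t fault;

	/*
	 * Takes mmap_lock for reading, upgrading to a write lock only if
	 * the stack has to be expanded. On failure the lock has already
	 * been dropped, so this is the bad_area_nosemaphore path: no
	 * unlock is needed.
	 */
	vma = lock_mm_and_find_vma(mm, address, regs);
	if (!vma)
		return;

	/* permission checks against vma->vm_flags would go here */

	fault = handle_mm_fault(vma, address, FAULT_FLAG_DEFAULT, regs);
	mmap_read_unlock(mm);

	/* mapping fault codes to signals or exception fixups omitted */
}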
+@@ -5517,6 +5648,14 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, + if (mmap_read_lock_killable(mm)) + return 0; + ++ /* We might need to expand the stack to access it */ ++ vma = vma_lookup(mm, addr); ++ if (!vma) { ++ vma = expand_stack(mm, addr); ++ if (!vma) ++ return 0; ++ } ++ + /* ignore errors, just check how much was successfully transferred */ + while (len) { + int bytes, ret, offset; +diff --git a/mm/mmap.c b/mm/mmap.c +index 14ca259189b77..b8af52db3bbe0 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -1945,7 +1945,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, + * PA-RISC uses this for its stack; IA64 for its Register Backing Store. + * vma is the last one with address > vma->vm_end. Have to extend vma. + */ +-int expand_upwards(struct vm_area_struct *vma, unsigned long address) ++static int expand_upwards(struct vm_area_struct *vma, unsigned long address) + { + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; +@@ -2036,6 +2036,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) + + /* + * vma is the first one with address < vma->vm_start. Have to extend vma. ++ * mmap_lock held for writing. + */ + int expand_downwards(struct vm_area_struct *vma, unsigned long address) + { +@@ -2044,16 +2045,20 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *prev; + int error = 0; + ++ if (!(vma->vm_flags & VM_GROWSDOWN)) ++ return -EFAULT; ++ + address &= PAGE_MASK; +- if (address < mmap_min_addr) ++ if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) + return -EPERM; + + /* Enforce stack_guard_gap */ + prev = mas_prev(&mas, 0); + /* Check that both stack segments have the same anon_vma? */ +- if (prev && !(prev->vm_flags & VM_GROWSDOWN) && +- vma_is_accessible(prev)) { +- if (address - prev->vm_end < stack_guard_gap) ++ if (prev) { ++ if (!(prev->vm_flags & VM_GROWSDOWN) && ++ vma_is_accessible(prev) && ++ (address - prev->vm_end < stack_guard_gap)) + return -ENOMEM; + } + +@@ -2132,13 +2137,12 @@ static int __init cmdline_parse_stack_guard_gap(char *p) + __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); + + #ifdef CONFIG_STACK_GROWSUP +-int expand_stack(struct vm_area_struct *vma, unsigned long address) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) + { + return expand_upwards(vma, address); + } + +-struct vm_area_struct * +-find_extend_vma(struct mm_struct *mm, unsigned long addr) ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) + { + struct vm_area_struct *vma, *prev; + +@@ -2146,20 +2150,23 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) + vma = find_vma_prev(mm, addr, &prev); + if (vma && (vma->vm_start <= addr)) + return vma; +- if (!prev || expand_stack(prev, addr)) ++ if (!prev) ++ return NULL; ++ if (expand_stack_locked(prev, addr)) + return NULL; + if (prev->vm_flags & VM_LOCKED) + populate_vma_page_range(prev, addr, prev->vm_end, NULL); + return prev; + } + #else +-int expand_stack(struct vm_area_struct *vma, unsigned long address) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) + { ++ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) ++ return -EINVAL; + return expand_downwards(vma, address); + } + +-struct vm_area_struct * +-find_extend_vma(struct mm_struct *mm, unsigned long addr) ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) + { + struct vm_area_struct *vma; + unsigned 
long start; +@@ -2170,10 +2177,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) + return NULL; + if (vma->vm_start <= addr) + return vma; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- return NULL; + start = vma->vm_start; +- if (expand_stack(vma, addr)) ++ if (expand_stack_locked(vma, addr)) + return NULL; + if (vma->vm_flags & VM_LOCKED) + populate_vma_page_range(vma, addr, start, NULL); +@@ -2181,7 +2186,91 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) + } + #endif + +-EXPORT_SYMBOL_GPL(find_extend_vma); ++/* ++ * IA64 has some horrid mapping rules: it can expand both up and down, ++ * but with various special rules. ++ * ++ * We'll get rid of this architecture eventually, so the ugliness is ++ * temporary. ++ */ ++#ifdef CONFIG_IA64 ++static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr) ++{ ++ return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) && ++ REGION_OFFSET(addr) < RGN_MAP_LIMIT; ++} ++ ++/* ++ * IA64 stacks grow down, but there's a special register backing store ++ * that can grow up. Only sequentially, though, so the new address must ++ * match vm_end. ++ */ ++static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr) ++{ ++ if (!vma_expand_ok(vma, addr)) ++ return -EFAULT; ++ if (vma->vm_end != (addr & PAGE_MASK)) ++ return -EFAULT; ++ return expand_upwards(vma, addr); ++} ++ ++static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr) ++{ ++ if (!vma_expand_ok(vma, addr)) ++ return -EFAULT; ++ return expand_downwards(vma, addr); ++} ++ ++#elif defined(CONFIG_STACK_GROWSUP) ++ ++#define vma_expand_up(vma,addr) expand_upwards(vma, addr) ++#define vma_expand_down(vma, addr) (-EFAULT) ++ ++#else ++ ++#define vma_expand_up(vma,addr) (-EFAULT) ++#define vma_expand_down(vma, addr) expand_downwards(vma, addr) ++ ++#endif ++ ++/* ++ * expand_stack(): legacy interface for page faulting. Don't use unless ++ * you have to. ++ * ++ * This is called with the mm locked for reading, drops the lock, takes ++ * the lock for writing, tries to look up a vma again, expands it if ++ * necessary, and downgrades the lock to reading again. ++ * ++ * If no vma is found or it can't be expanded, it returns NULL and has ++ * dropped the lock. ++ */ ++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) ++{ ++ struct vm_area_struct *vma, *prev; ++ ++ mmap_read_unlock(mm); ++ if (mmap_write_lock_killable(mm)) ++ return NULL; ++ ++ vma = find_vma_prev(mm, addr, &prev); ++ if (vma && vma->vm_start <= addr) ++ goto success; ++ ++ if (prev && !vma_expand_up(prev, addr)) { ++ vma = prev; ++ goto success; ++ } ++ ++ if (vma && !vma_expand_down(vma, addr)) ++ goto success; ++ ++ mmap_write_unlock(mm); ++ return NULL; ++ ++success: ++ mmap_write_downgrade(mm); ++ return vma; ++} + + /* + * Ok - we have the memory areas we should free on a maple tree so release them, +@@ -2311,19 +2400,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + return __split_vma(mm, vma, addr, new_below); + } + +-static inline int munmap_sidetree(struct vm_area_struct *vma, +- struct ma_state *mas_detach) +-{ +- mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1); +- if (mas_store_gfp(mas_detach, vma, GFP_KERNEL)) +- return -ENOMEM; +- +- if (vma->vm_flags & VM_LOCKED) +- vma->vm_mm->locked_vm -= vma_pages(vma); +- +- return 0; +-} +- + /* + * do_mas_align_munmap() - munmap the aligned region from @start to @end. 
+ * @mas: The maple_state, ideally set up to alter the correct tree location. +@@ -2345,6 +2421,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, + struct maple_tree mt_detach; + int count = 0; + int error = -ENOMEM; ++ unsigned long locked_vm = 0; + MA_STATE(mas_detach, &mt_detach, 0, 0); + mt_init_flags(&mt_detach, mas->tree->ma_flags & MT_FLAGS_LOCK_MASK); + mt_set_external_lock(&mt_detach, &mm->mmap_lock); +@@ -2403,18 +2480,24 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, + + mas_set(mas, end); + split = mas_prev(mas, 0); +- error = munmap_sidetree(split, &mas_detach); ++ mas_set_range(&mas_detach, split->vm_start, split->vm_end - 1); ++ error = mas_store_gfp(&mas_detach, split, GFP_KERNEL); + if (error) +- goto munmap_sidetree_failed; ++ goto munmap_gather_failed; ++ if (next->vm_flags & VM_LOCKED) ++ locked_vm += vma_pages(split); + + count++; + if (vma == next) + vma = split; + break; + } +- error = munmap_sidetree(next, &mas_detach); ++ mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1); ++ error = mas_store_gfp(&mas_detach, next, GFP_KERNEL); + if (error) +- goto munmap_sidetree_failed; ++ goto munmap_gather_failed; ++ if (next->vm_flags & VM_LOCKED) ++ locked_vm += vma_pages(next); + + count++; + #ifdef CONFIG_DEBUG_VM_MAPLE_TREE +@@ -2463,7 +2546,10 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, + mas_set_range(mas, start, end - 1); + } + #endif ++ /* Point of no return */ + mas_store_prealloc(mas, NULL); ++ ++ mm->locked_vm -= locked_vm; + mm->map_count -= count; + /* + * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or +@@ -2490,7 +2576,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, + return downgrade ? 1 : 0; + + userfaultfd_error: +-munmap_sidetree_failed: ++munmap_gather_failed: + end_split_failed: + __mt_destroy(&mt_detach); + start_split_failed: +diff --git a/mm/nommu.c b/mm/nommu.c +index 5b83938ecb67c..da26b3fec9ebe 100644 +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -682,23 +682,31 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) + EXPORT_SYMBOL(find_vma); + + /* +- * find a VMA +- * - we don't extend stack VMAs under NOMMU conditions ++ * At least xtensa ends up having protection faults even with no ++ * MMU.. No stack expansion, at least. 
+ */ +-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) ++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, ++ unsigned long addr, struct pt_regs *regs) + { +- return find_vma(mm, addr); ++ mmap_read_lock(mm); ++ return vma_lookup(mm, addr); + } + + /* + * expand a stack to a given address + * - not supported under NOMMU conditions + */ +-int expand_stack(struct vm_area_struct *vma, unsigned long address) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr) + { + return -ENOMEM; + } + ++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) ++{ ++ mmap_read_unlock(mm); ++ return NULL; ++} ++ + /* + * look up the first VMA exactly that exactly matches addr + * - should be called with mm->mmap_lock at least held readlocked +diff --git a/net/can/isotp.c b/net/can/isotp.c +index 1af623839bffa..b3c2a49b189cc 100644 +--- a/net/can/isotp.c ++++ b/net/can/isotp.c +@@ -1079,8 +1079,9 @@ wait_free_buffer: + if (err) + goto err_event_drop; + +- if (sk->sk_err) +- return -sk->sk_err; ++ err = sock_error(sk); ++ if (err) ++ return err; + } + + return size; +diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c +index 01d34ee4525ea..9127a7fd5269c 100644 +--- a/net/mptcp/pm_netlink.c ++++ b/net/mptcp/pm_netlink.c +@@ -1039,6 +1039,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, + return err; + } + ++ inet_sk_state_store(newsk, TCP_LISTEN); + err = kernel_listen(ssock, backlog); + if (err) { + pr_warn("kernel_listen error, err=%d", err); +diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c +index a2c6ce40e4269..4ca61e80f4bb2 100644 +--- a/net/mptcp/protocol.c ++++ b/net/mptcp/protocol.c +@@ -2400,12 +2400,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, + kfree_rcu(subflow, rcu); + } else { + /* otherwise tcp will dispose of the ssk and subflow ctx */ +- if (ssk->sk_state == TCP_LISTEN) { +- tcp_set_state(ssk, TCP_CLOSE); +- mptcp_subflow_queue_clean(sk, ssk); +- inet_csk_listen_stop(ssk); +- } +- + __tcp_close(ssk, 0); + + /* close acquired an extra ref */ +@@ -2939,6 +2933,24 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) + return EPOLLIN | EPOLLRDNORM; + } + ++static void mptcp_check_listen_stop(struct sock *sk) ++{ ++ struct sock *ssk; ++ ++ if (inet_sk_state_load(sk) != TCP_LISTEN) ++ return; ++ ++ ssk = mptcp_sk(sk)->first; ++ if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN)) ++ return; ++ ++ lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); ++ mptcp_subflow_queue_clean(sk, ssk); ++ inet_csk_listen_stop(ssk); ++ tcp_set_state(ssk, TCP_CLOSE); ++ release_sock(ssk); ++} ++ + bool __mptcp_close(struct sock *sk, long timeout) + { + struct mptcp_subflow_context *subflow; +@@ -2949,6 +2961,7 @@ bool __mptcp_close(struct sock *sk, long timeout) + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); + + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { ++ mptcp_check_listen_stop(sk); + inet_sk_state_store(sk, TCP_CLOSE); + goto cleanup; + } +@@ -3062,6 +3075,7 @@ static int mptcp_disconnect(struct sock *sk, int flags) + if (msk->fastopening) + return -EBUSY; + ++ mptcp_check_listen_stop(sk); + inet_sk_state_store(sk, TCP_CLOSE); + + mptcp_stop_timer(sk);