From: "Mike Pagano" <mpagano@gentoo.org>
To: gentoo-commits@lists.gentoo.org
Subject: [gentoo-commits] proj/linux-patches:3.14 commit in: /
Date: Thu,  9 Oct 2014 23:03:27 +0000 (UTC)
Message-ID: <1412895805.3854e739b3925782361de3d4306ab04a5318b1b5.mpagano@gentoo>

commit:     3854e739b3925782361de3d4306ab04a5318b1b5
Author:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
AuthorDate: Thu Oct  9 23:03:25 2014 +0000
Commit:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
CommitDate: Thu Oct  9 23:03:25 2014 +0000
URL:        http://sources.gentoo.org/gitweb/?p=proj/linux-patches.git;a=commit;h=3854e739

Linux patch 3.14.21

---
 0000_README              |    4 +
 1020_linux-3.14.21.patch | 2520 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 2524 insertions(+)

diff --git a/0000_README b/0000_README
index 10c4bc3..f450f73 100644
--- a/0000_README
+++ b/0000_README
@@ -122,6 +122,10 @@ Patch:  1019_linux-3.14.20.patch
 From:   http://www.kernel.org
 Desc:   Linux 3.14.20
 
+Patch:  1020_linux-3.14.21.patch
+From:   http://www.kernel.org
+Desc:   Linux 3.14.21
+
 Patch:  1500_XATTR_USER_PREFIX.patch
 From:   https://bugs.gentoo.org/show_bug.cgi?id=470644
 Desc:   Support for namespace user.pax.* on tmpfs.

diff --git a/1020_linux-3.14.21.patch b/1020_linux-3.14.21.patch
new file mode 100644
index 0000000..a8ab18d
--- /dev/null
+++ b/1020_linux-3.14.21.patch
@@ -0,0 +1,2520 @@
+diff --git a/Makefile b/Makefile
+index beb7e6f0803b..41e6e19fe2e9 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,6 +1,6 @@
+ VERSION = 3
+ PATCHLEVEL = 14
+-SUBLEVEL = 20
++SUBLEVEL = 21
+ EXTRAVERSION =
+ NAME = Remembering Coco
+ 
+diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
+index fb5e4c658f7a..ef470a7a3d0f 100644
+--- a/arch/unicore32/include/asm/mmu_context.h
++++ b/arch/unicore32/include/asm/mmu_context.h
+@@ -14,6 +14,8 @@
+ 
+ #include <linux/compiler.h>
+ #include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/io.h>
+ 
+ #include <asm/cacheflush.h>
+@@ -73,7 +75,7 @@ do { \
+ 		else \
+ 			mm->mmap = NULL; \
+ 		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
+-		mm->mmap_cache = NULL; \
++		vmacache_invalidate(mm); \
+ 		mm->map_count--; \
+ 		remove_vma(high_vma); \
+ 	} \
+diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
+index c706d50a8b06..8c16c2f97026 100644
+--- a/drivers/block/drbd/drbd_nl.c
++++ b/drivers/block/drbd/drbd_nl.c
+@@ -525,6 +525,12 @@ void conn_try_outdate_peer_async(struct drbd_tconn *tconn)
+ 	struct task_struct *opa;
+ 
+ 	kref_get(&tconn->kref);
++	/* We may just have force_sig()'ed this thread
++	 * to get it out of some blocking network function.
++	 * Clear signals; otherwise kthread_run(), which internally uses
++	 * wait_on_completion_killable(), will mistake our pending signal
++	 * for a new fatal signal and fail. */
++	flush_signals(current);
+ 	opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h");
+ 	if (IS_ERR(opa)) {
+ 		conn_err(tconn, "out of mem, failed to invoke fence-peer helper\n");
+diff --git a/drivers/cpufreq/integrator-cpufreq.c b/drivers/cpufreq/integrator-cpufreq.c
+index 0e27844e8c2d..8089dd2cd9d8 100644
+--- a/drivers/cpufreq/integrator-cpufreq.c
++++ b/drivers/cpufreq/integrator-cpufreq.c
+@@ -213,9 +213,9 @@ static int __init integrator_cpufreq_probe(struct platform_device *pdev)
+ 	return cpufreq_register_driver(&integrator_driver);
+ }
+ 
+-static void __exit integrator_cpufreq_remove(struct platform_device *pdev)
++static int __exit integrator_cpufreq_remove(struct platform_device *pdev)
+ {
+-	cpufreq_unregister_driver(&integrator_driver);
++	return cpufreq_unregister_driver(&integrator_driver);
+ }
+ 
+ static const struct of_device_id integrator_cpufreq_match[] = {
+diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
+index d278be110805..1855cdca39cd 100644
+--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
++++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
+@@ -827,6 +827,16 @@ void i915_check_and_clear_faults(struct drm_device *dev)
+ 	POSTING_READ(RING_FAULT_REG(&dev_priv->ring[RCS]));
+ }
+ 
++static void i915_ggtt_flush(struct drm_i915_private *dev_priv)
++{
++	if (INTEL_INFO(dev_priv->dev)->gen < 6) {
++		intel_gtt_chipset_flush();
++	} else {
++		I915_WRITE(GFX_FLSH_CNTL_GEN6, GFX_FLSH_CNTL_EN);
++		POSTING_READ(GFX_FLSH_CNTL_GEN6);
++	}
++}
++
+ void i915_gem_suspend_gtt_mappings(struct drm_device *dev)
+ {
+ 	struct drm_i915_private *dev_priv = dev->dev_private;
+@@ -843,6 +853,8 @@ void i915_gem_suspend_gtt_mappings(struct drm_device *dev)
+ 				       dev_priv->gtt.base.start / PAGE_SIZE,
+ 				       dev_priv->gtt.base.total / PAGE_SIZE,
+ 				       true);
++
++	i915_ggtt_flush(dev_priv);
+ }
+ 
+ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
+@@ -863,7 +875,7 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
+ 		i915_gem_gtt_bind_object(obj, obj->cache_level);
+ 	}
+ 
+-	i915_gem_chipset_flush(dev);
++	i915_ggtt_flush(dev_priv);
+ }
+ 
+ int i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj)
+diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
+index 18cda77b4f79..4913c0690872 100644
+--- a/drivers/md/raid5.c
++++ b/drivers/md/raid5.c
+@@ -64,6 +64,10 @@
+ #define cpu_to_group(cpu) cpu_to_node(cpu)
+ #define ANY_GROUP NUMA_NO_NODE
+ 
++static bool devices_handle_discard_safely = false;
++module_param(devices_handle_discard_safely, bool, 0644);
++MODULE_PARM_DESC(devices_handle_discard_safely,
++		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
+ static struct workqueue_struct *raid5_wq;
+ /*
+  * Stripe cache
+@@ -6117,7 +6121,7 @@ static int run(struct mddev *mddev)
+ 		mddev->queue->limits.discard_granularity = stripe;
+ 		/*
+ 		 * unaligned part of discard request will be ignored, so can't
+-		 * guarantee discard_zerors_data
++		 * guarantee discard_zeroes_data
+ 		 */
+ 		mddev->queue->limits.discard_zeroes_data = 0;
+ 
+@@ -6142,6 +6146,18 @@ static int run(struct mddev *mddev)
+ 			    !bdev_get_queue(rdev->bdev)->
+ 						limits.discard_zeroes_data)
+ 				discard_supported = false;
++			/* Unfortunately, discard_zeroes_data is not currently
++			 * a guarantee - just a hint.  So we only allow DISCARD
++			 * if the sysadmin has confirmed that only safe devices
++			 * are in use by setting a module parameter.
++			 */
++			if (!devices_handle_discard_safely) {
++				if (discard_supported) {
++					pr_info("md/raid456: discard support disabled due to uncertainty.\n");
++					pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
++				}
++				discard_supported = false;
++			}
+ 		}
+ 
+ 		if (discard_supported &&
+diff --git a/drivers/media/v4l2-core/videobuf2-core.c b/drivers/media/v4l2-core/videobuf2-core.c
+index a127925c9d61..06faea4d60ee 100644
+--- a/drivers/media/v4l2-core/videobuf2-core.c
++++ b/drivers/media/v4l2-core/videobuf2-core.c
+@@ -745,6 +745,7 @@ static int __reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req)
+ 	 * to the userspace.
+ 	 */
+ 	req->count = allocated_buffers;
++	q->waiting_for_buffers = !V4L2_TYPE_IS_OUTPUT(q->type);
+ 
+ 	return 0;
+ }
+@@ -793,6 +794,7 @@ static int __create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create
+ 		memset(q->plane_sizes, 0, sizeof(q->plane_sizes));
+ 		memset(q->alloc_ctx, 0, sizeof(q->alloc_ctx));
+ 		q->memory = create->memory;
++		q->waiting_for_buffers = !V4L2_TYPE_IS_OUTPUT(q->type);
+ 	}
+ 
+ 	num_buffers = min(create->count, VIDEO_MAX_FRAME - q->num_buffers);
+@@ -1447,6 +1449,7 @@ static int vb2_internal_qbuf(struct vb2_queue *q, struct v4l2_buffer *b)
+ 	 * dequeued in dqbuf.
+ 	 */
+ 	list_add_tail(&vb->queued_entry, &q->queued_list);
++	q->waiting_for_buffers = false;
+ 	vb->state = VB2_BUF_STATE_QUEUED;
+ 
+ 	/*
+@@ -1841,6 +1844,7 @@ static int vb2_internal_streamoff(struct vb2_queue *q, enum v4l2_buf_type type)
+ 	 * and videobuf, effectively returning control over them to userspace.
+ 	 */
+ 	__vb2_queue_cancel(q);
++	q->waiting_for_buffers = !V4L2_TYPE_IS_OUTPUT(q->type);
+ 
+ 	dprintk(3, "Streamoff successful\n");
+ 	return 0;
+@@ -2150,9 +2154,16 @@ unsigned int vb2_poll(struct vb2_queue *q, struct file *file, poll_table *wait)
+ 	}
+ 
+ 	/*
+-	 * There is nothing to wait for if no buffers have already been queued.
++	 * There is nothing to wait for if the queue isn't streaming.
+ 	 */
+-	if (list_empty(&q->queued_list))
++	if (!vb2_is_streaming(q))
++		return res | POLLERR;
++	/*
++	 * For compatibility with vb1: if QBUF hasn't been called yet, then
++	 * return POLLERR as well. This only affects capture queues, output
++	 * queues will always initialize waiting_for_buffers to false.
++	 */
++	if (q->waiting_for_buffers)
+ 		return res | POLLERR;
+ 
+ 	if (list_empty(&q->done_list))
+diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
+index f15d4353f30f..5d12d69e2045 100644
+--- a/fs/cifs/cifsglob.h
++++ b/fs/cifs/cifsglob.h
+@@ -399,6 +399,8 @@ struct smb_version_operations {
+ 			const struct cifs_fid *, u32 *);
+ 	int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *,
+ 			int);
++	/* check if we need to issue closedir */
++	bool (*dir_needs_close)(struct cifsFileInfo *);
+ };
+ 
+ struct smb_version_values {
+diff --git a/fs/cifs/file.c b/fs/cifs/file.c
+index 8175b18df819..d375322b6cec 100644
+--- a/fs/cifs/file.c
++++ b/fs/cifs/file.c
+@@ -762,7 +762,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
+ 
+ 	cifs_dbg(FYI, "Freeing private data in close dir\n");
+ 	spin_lock(&cifs_file_list_lock);
+-	if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
++	if (server->ops->dir_needs_close(cfile)) {
+ 		cfile->invalidHandle = true;
+ 		spin_unlock(&cifs_file_list_lock);
+ 		if (server->ops->close_dir)
+diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
+index 2bbf11b09214..b334a89d6a66 100644
+--- a/fs/cifs/readdir.c
++++ b/fs/cifs/readdir.c
+@@ -593,7 +593,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
+ 		/* close and restart search */
+ 		cifs_dbg(FYI, "search backing up - close and restart search\n");
+ 		spin_lock(&cifs_file_list_lock);
+-		if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
++		if (server->ops->dir_needs_close(cfile)) {
+ 			cfile->invalidHandle = true;
+ 			spin_unlock(&cifs_file_list_lock);
+ 			if (server->ops->close_dir)
+diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
+index d1fdfa848703..e9ad8d37bb00 100644
+--- a/fs/cifs/smb1ops.c
++++ b/fs/cifs/smb1ops.c
+@@ -586,7 +586,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ 		tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
+ 		if (tmprc == -EOPNOTSUPP)
+ 			*symlink = true;
+-		else
++		else if (tmprc == 0)
+ 			CIFSSMBClose(xid, tcon, fid.netfid);
+ 	}
+ 
+@@ -1009,6 +1009,12 @@ cifs_is_read_op(__u32 oplock)
+ 	return oplock == OPLOCK_READ;
+ }
+ 
++static bool
++cifs_dir_needs_close(struct cifsFileInfo *cfile)
++{
++	return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle;
++}
++
+ struct smb_version_operations smb1_operations = {
+ 	.send_cancel = send_nt_cancel,
+ 	.compare_fids = cifs_compare_fids,
+@@ -1078,6 +1084,7 @@ struct smb_version_operations smb1_operations = {
+ 	.query_mf_symlink = cifs_query_mf_symlink,
+ 	.create_mf_symlink = cifs_create_mf_symlink,
+ 	.is_read_op = cifs_is_read_op,
++	.dir_needs_close = cifs_dir_needs_close,
+ #ifdef CONFIG_CIFS_XATTR
+ 	.query_all_EAs = CIFSSMBQAllEAs,
+ 	.set_EA = CIFSSMBSetEA,
+diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
+index e31a9dfdcd39..a491814cb2c0 100644
+--- a/fs/cifs/smb2maperror.c
++++ b/fs/cifs/smb2maperror.c
+@@ -214,7 +214,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
+ 	{STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"},
+ 	{STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"},
+ 	{STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"},
+-	{STATUS_NO_MORE_FILES, -EIO, "STATUS_NO_MORE_FILES"},
++	{STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"},
+ 	{STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"},
+ 	{STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"},
+ 	{STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"},
+@@ -256,6 +256,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
+ 	{STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO,
+ 	"STATUS_DLL_MIGHT_BE_INCOMPATIBLE"},
+ 	{STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"},
++	{STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EOPNOTSUPP,
++	"STATUS_REPARSE_NOT_HANDLED"},
+ 	{STATUS_DEVICE_REQUIRES_CLEANING, -EIO,
+ 	"STATUS_DEVICE_REQUIRES_CLEANING"},
+ 	{STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"},
+diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
+index f8977b2d9187..34a17d425be6 100644
+--- a/fs/cifs/smb2ops.c
++++ b/fs/cifs/smb2ops.c
+@@ -1102,6 +1102,12 @@ smb3_parse_lease_buf(void *buf, unsigned int *epoch)
+ 	return le32_to_cpu(lc->lcontext.LeaseState);
+ }
+ 
++static bool
++smb2_dir_needs_close(struct cifsFileInfo *cfile)
++{
++	return !cfile->invalidHandle;
++}
++
+ struct smb_version_operations smb20_operations = {
+ 	.compare_fids = smb2_compare_fids,
+ 	.setup_request = smb2_setup_request,
+@@ -1175,6 +1181,7 @@ struct smb_version_operations smb20_operations = {
+ 	.create_lease_buf = smb2_create_lease_buf,
+ 	.parse_lease_buf = smb2_parse_lease_buf,
+ 	.clone_range = smb2_clone_range,
++	.dir_needs_close = smb2_dir_needs_close,
+ };
+ 
+ struct smb_version_operations smb21_operations = {
+@@ -1250,6 +1257,7 @@ struct smb_version_operations smb21_operations = {
+ 	.create_lease_buf = smb2_create_lease_buf,
+ 	.parse_lease_buf = smb2_parse_lease_buf,
+ 	.clone_range = smb2_clone_range,
++	.dir_needs_close = smb2_dir_needs_close,
+ };
+ 
+ struct smb_version_operations smb30_operations = {
+@@ -1328,6 +1336,7 @@ struct smb_version_operations smb30_operations = {
+ 	.parse_lease_buf = smb3_parse_lease_buf,
+ 	.clone_range = smb2_clone_range,
+ 	.validate_negotiate = smb3_validate_negotiate,
++	.dir_needs_close = smb2_dir_needs_close,
+ };
+ 
+ struct smb_version_values smb20_values = {
+diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
+index 9aab8fe0e508..348792911e1f 100644
+--- a/fs/cifs/smb2pdu.c
++++ b/fs/cifs/smb2pdu.c
+@@ -2136,6 +2136,10 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
+ 	rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base;
+ 
+ 	if (rc) {
++		if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) {
++			srch_inf->endOfSearch = true;
++			rc = 0;
++		}
+ 		cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
+ 		goto qdir_exit;
+ 	}
+@@ -2173,11 +2177,6 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
+ 	else
+ 		cifs_dbg(VFS, "illegal search buffer type\n");
+ 
+-	if (rsp->hdr.Status == STATUS_NO_MORE_FILES)
+-		srch_inf->endOfSearch = 1;
+-	else
+-		srch_inf->endOfSearch = 0;
+-
+ 	return rc;
+ 
+ qdir_exit:
+diff --git a/fs/exec.c b/fs/exec.c
+index 31e46b1b358b..ea4449d0536a 100644
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -26,6 +26,7 @@
+ #include <linux/file.h>
+ #include <linux/fdtable.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/stat.h>
+ #include <linux/fcntl.h>
+ #include <linux/swap.h>
+@@ -820,7 +821,7 @@ EXPORT_SYMBOL(read_code);
+ static int exec_mmap(struct mm_struct *mm)
+ {
+ 	struct task_struct *tsk;
+-	struct mm_struct * old_mm, *active_mm;
++	struct mm_struct *old_mm, *active_mm;
+ 
+ 	/* Notify parent that we're no longer interested in the old VM */
+ 	tsk = current;
+@@ -846,6 +847,8 @@ static int exec_mmap(struct mm_struct *mm)
+ 	tsk->mm = mm;
+ 	tsk->active_mm = mm;
+ 	activate_mm(active_mm, mm);
++	tsk->mm->vmacache_seqnum = 0;
++	vmacache_flush(tsk);
+ 	task_unlock(tsk);
+ 	if (old_mm) {
+ 		up_read(&old_mm->mmap_sem);
+diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
+index d19b30ababf1..a4a8ed56e438 100644
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -1017,6 +1017,11 @@ static int __init init_hugetlbfs_fs(void)
+ 	int error;
+ 	int i;
+ 
++	if (!hugepages_supported()) {
++		pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n");
++		return -ENOTSUPP;
++	}
++
+ 	error = bdi_init(&hugetlbfs_backing_dev_info);
+ 	if (error)
+ 		return error;
+diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
+index 8f788193e3d4..c4b2646b6d7c 100644
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -1,4 +1,5 @@
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/hugetlb.h>
+ #include <linux/huge_mm.h>
+ #include <linux/mount.h>
+@@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
+ 
+ 	/*
+ 	 * We remember last_addr rather than next_addr to hit with
+-	 * mmap_cache most of the time. We have zero last_addr at
++	 * vmacache most of the time. We have zero last_addr at
+ 	 * the beginning and also after lseek. We will have -1 last_addr
+ 	 * after the end of the vmas.
+ 	 */
+diff --git a/fs/udf/inode.c b/fs/udf/inode.c
+index 982ce05c87ed..287cd5f23421 100644
+--- a/fs/udf/inode.c
++++ b/fs/udf/inode.c
+@@ -1271,13 +1271,22 @@ update_time:
+ 	return 0;
+ }
+ 
++/*
++ * Maximum length of linked list formed by ICB hierarchy. The chosen number is
++ * arbitrary - just that we hopefully don't limit any real use of rewritten
++ * inode on write-once media but avoid looping for too long on corrupted media.
++ */
++#define UDF_MAX_ICB_NESTING 1024
++
+ static void __udf_read_inode(struct inode *inode)
+ {
+ 	struct buffer_head *bh = NULL;
+ 	struct fileEntry *fe;
+ 	uint16_t ident;
+ 	struct udf_inode_info *iinfo = UDF_I(inode);
++	unsigned int indirections = 0;
+ 
++reread:
+ 	/*
+ 	 * Set defaults, but the inode is still incomplete!
+ 	 * Note: get_new_inode() sets the following on a new inode:
+@@ -1314,28 +1323,26 @@ static void __udf_read_inode(struct inode *inode)
+ 		ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
+ 					&ident);
+ 		if (ident == TAG_IDENT_IE && ibh) {
+-			struct buffer_head *nbh = NULL;
+ 			struct kernel_lb_addr loc;
+ 			struct indirectEntry *ie;
+ 
+ 			ie = (struct indirectEntry *)ibh->b_data;
+ 			loc = lelb_to_cpu(ie->indirectICB.extLocation);
+ 
+-			if (ie->indirectICB.extLength &&
+-				(nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
+-							&ident))) {
+-				if (ident == TAG_IDENT_FE ||
+-					ident == TAG_IDENT_EFE) {
+-					memcpy(&iinfo->i_location,
+-						&loc,
+-						sizeof(struct kernel_lb_addr));
+-					brelse(bh);
+-					brelse(ibh);
+-					brelse(nbh);
+-					__udf_read_inode(inode);
++			if (ie->indirectICB.extLength) {
++				brelse(bh);
++				brelse(ibh);
++				memcpy(&iinfo->i_location, &loc,
++				       sizeof(struct kernel_lb_addr));
++				if (++indirections > UDF_MAX_ICB_NESTING) {
++					udf_err(inode->i_sb,
++						"too many ICBs in ICB hierarchy"
++						" (max %d supported)\n",
++						UDF_MAX_ICB_NESTING);
++					make_bad_inode(inode);
+ 					return;
+ 				}
+-				brelse(nbh);
++				goto reread;
+ 			}
+ 		}
+ 		brelse(ibh);
+diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
+index 3fe661fe96d1..b19d3dc2e651 100644
+--- a/include/linux/cpuset.h
++++ b/include/linux/cpuset.h
+@@ -87,25 +87,26 @@ extern void rebuild_sched_domains(void);
+ extern void cpuset_print_task_mems_allowed(struct task_struct *p);
+ 
+ /*
+- * get_mems_allowed is required when making decisions involving mems_allowed
+- * such as during page allocation. mems_allowed can be updated in parallel
+- * and depending on the new value an operation can fail potentially causing
+- * process failure. A retry loop with get_mems_allowed and put_mems_allowed
+- * prevents these artificial failures.
++ * read_mems_allowed_begin is required when making decisions involving
++ * mems_allowed such as during page allocation. mems_allowed can be updated in
++ * parallel and depending on the new value an operation can fail potentially
++ * causing process failure. A retry loop with read_mems_allowed_begin and
++ * read_mems_allowed_retry prevents these artificial failures.
+  */
+-static inline unsigned int get_mems_allowed(void)
++static inline unsigned int read_mems_allowed_begin(void)
+ {
+ 	return read_seqcount_begin(&current->mems_allowed_seq);
+ }
+ 
+ /*
+- * If this returns false, the operation that took place after get_mems_allowed
+- * may have failed. It is up to the caller to retry the operation if
++ * If this returns true, the operation that took place after
++ * read_mems_allowed_begin may have failed artificially due to a concurrent
++ * update of mems_allowed. It is up to the caller to retry the operation if
+  * appropriate.
+  */
+-static inline bool put_mems_allowed(unsigned int seq)
++static inline bool read_mems_allowed_retry(unsigned int seq)
+ {
+-	return !read_seqcount_retry(&current->mems_allowed_seq, seq);
++	return read_seqcount_retry(&current->mems_allowed_seq, seq);
+ }
+ 
+ static inline void set_mems_allowed(nodemask_t nodemask)
+@@ -225,14 +226,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
+ {
+ }
+ 
+-static inline unsigned int get_mems_allowed(void)
++static inline unsigned int read_mems_allowed_begin(void)
+ {
+ 	return 0;
+ }
+ 
+-static inline bool put_mems_allowed(unsigned int seq)
++static inline bool read_mems_allowed_retry(unsigned int seq)
+ {
+-	return true;
++	return false;
+ }
+ 
+ #endif /* !CONFIG_CPUSETS */
+diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
+index bd1e9bcec547..42b05c4c53e5 100644
+--- a/include/linux/hugetlb.h
++++ b/include/linux/hugetlb.h
+@@ -400,6 +400,16 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
+ 	return &mm->page_table_lock;
+ }
+ 
++static inline bool hugepages_supported(void)
++{
++	/*
++	 * Some platform decide whether they support huge pages at boot
++	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
++	 * there is no such support
++	 */
++	return HPAGE_SHIFT != 0;
++}
++
+ #else	/* CONFIG_HUGETLB_PAGE */
+ struct hstate {};
+ #define alloc_huge_page_node(h, nid) NULL
+diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
+index 1f44466c1e9d..c367cbdf73ab 100644
+--- a/include/linux/jiffies.h
++++ b/include/linux/jiffies.h
+@@ -258,23 +258,11 @@ extern unsigned long preset_lpj;
+ #define SEC_JIFFIE_SC (32 - SHIFT_HZ)
+ #endif
+ #define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29)
+-#define USEC_JIFFIE_SC (SEC_JIFFIE_SC + 19)
+ #define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\
+                                 TICK_NSEC -1) / (u64)TICK_NSEC))
+ 
+ #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\
+                                         TICK_NSEC -1) / (u64)TICK_NSEC))
+-#define USEC_CONVERSION  \
+-                    ((unsigned long)((((u64)NSEC_PER_USEC << USEC_JIFFIE_SC) +\
+-                                        TICK_NSEC -1) / (u64)TICK_NSEC))
+-/*
+- * USEC_ROUND is used in the timeval to jiffie conversion.  See there
+- * for more details.  It is the scaled resolution rounding value.  Note
+- * that it is a 64-bit value.  Since, when it is applied, we are already
+- * in jiffies (albit scaled), it is nothing but the bits we will shift
+- * off.
+- */
+-#define USEC_ROUND (u64)(((u64)1 << USEC_JIFFIE_SC) - 1)
+ /*
+  * The maximum jiffie value is (MAX_INT >> 1).  Here we translate that
+  * into seconds.  The 64-bit case will overflow if we are not careful,
+diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
+index 290901a8c1de..2b58d192ea24 100644
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -342,9 +342,9 @@ struct mm_rss_stat {
+ 
+ struct kioctx_table;
+ struct mm_struct {
+-	struct vm_area_struct * mmap;		/* list of VMAs */
++	struct vm_area_struct *mmap;		/* list of VMAs */
+ 	struct rb_root mm_rb;
+-	struct vm_area_struct * mmap_cache;	/* last find_vma result */
++	u32 vmacache_seqnum;                   /* per-thread vmacache */
+ #ifdef CONFIG_MMU
+ 	unsigned long (*get_unmapped_area) (struct file *filp,
+ 				unsigned long addr, unsigned long len,
+diff --git a/include/linux/plist.h b/include/linux/plist.h
+index aa0fb390bd29..8b6c970cff6c 100644
+--- a/include/linux/plist.h
++++ b/include/linux/plist.h
+@@ -98,6 +98,13 @@ struct plist_node {
+ }
+ 
+ /**
++ * PLIST_HEAD - declare and init plist_head
++ * @head:	name for struct plist_head variable
++ */
++#define PLIST_HEAD(head) \
++	struct plist_head head = PLIST_HEAD_INIT(head)
++
++/**
+  * PLIST_NODE_INIT - static struct plist_node initializer
+  * @node:	struct plist_node variable name
+  * @__prio:	initial node priority
+@@ -134,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio)
+ extern void plist_add(struct plist_node *node, struct plist_head *head);
+ extern void plist_del(struct plist_node *node, struct plist_head *head);
+ 
++extern void plist_requeue(struct plist_node *node, struct plist_head *head);
++
+ /**
+  * plist_for_each - iterate over the plist
+  * @pos:	the type * to use as a loop counter
+@@ -143,6 +152,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
+ 	 list_for_each_entry(pos, &(head)->node_list, node_list)
+ 
+ /**
++ * plist_for_each_continue - continue iteration over the plist
++ * @pos:	the type * to use as a loop cursor
++ * @head:	the head for your list
++ *
++ * Continue to iterate over plist, continuing after the current position.
++ */
++#define plist_for_each_continue(pos, head)	\
++	 list_for_each_entry_continue(pos, &(head)->node_list, node_list)
++
++/**
+  * plist_for_each_safe - iterate safely over a plist of given type
+  * @pos:	the type * to use as a loop counter
+  * @n:	another type * to use as temporary storage
+@@ -163,6 +182,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
+ 	 list_for_each_entry(pos, &(head)->node_list, mem.node_list)
+ 
+ /**
++ * plist_for_each_entry_continue - continue iteration over list of given type
++ * @pos:	the type * to use as a loop cursor
++ * @head:	the head for your list
++ * @m:		the name of the list_struct within the struct
++ *
++ * Continue to iterate over list of given type, continuing after
++ * the current position.
++ */
++#define plist_for_each_entry_continue(pos, head, m)	\
++	list_for_each_entry_continue(pos, &(head)->node_list, m.node_list)
++
++/**
+  * plist_for_each_entry_safe - iterate safely over list of given type
+  * @pos:	the type * to use as a loop counter
+  * @n:		another type * to use as temporary storage
+@@ -229,6 +260,20 @@ static inline int plist_node_empty(const struct plist_node *node)
+ #endif
+ 
+ /**
++ * plist_next - get the next entry in list
++ * @pos:	the type * to cursor
++ */
++#define plist_next(pos) \
++	list_next_entry(pos, node_list)
++
++/**
++ * plist_prev - get the prev entry in list
++ * @pos:	the type * to cursor
++ */
++#define plist_prev(pos) \
++	list_prev_entry(pos, node_list)
++
++/**
+  * plist_first - return the first node (and thus, highest priority)
+  * @head:	the &struct plist_head pointer
+  *
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index ccd0c6f24f2c..d7ca410ace93 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -59,6 +59,10 @@ struct sched_param {
+ 
+ #define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
+ 
++#define VMACACHE_BITS 2
++#define VMACACHE_SIZE (1U << VMACACHE_BITS)
++#define VMACACHE_MASK (VMACACHE_SIZE - 1)
++
+ /*
+  * Extended scheduling parameters data structure.
+  *
+@@ -1228,6 +1232,9 @@ struct task_struct {
+ #ifdef CONFIG_COMPAT_BRK
+ 	unsigned brk_randomized:1;
+ #endif
++	/* per-thread vma caching */
++	u32 vmacache_seqnum;
++	struct vm_area_struct *vmacache[VMACACHE_SIZE];
+ #if defined(SPLIT_RSS_COUNTING)
+ 	struct task_rss_stat	rss_stat;
+ #endif
+diff --git a/include/linux/swap.h b/include/linux/swap.h
+index 46ba0c6c219f..789324976801 100644
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -214,8 +214,9 @@ struct percpu_cluster {
+ struct swap_info_struct {
+ 	unsigned long	flags;		/* SWP_USED etc: see above */
+ 	signed short	prio;		/* swap priority of this type */
++	struct plist_node list;		/* entry in swap_active_head */
++	struct plist_node avail_list;	/* entry in swap_avail_head */
+ 	signed char	type;		/* strange name for an index */
+-	signed char	next;		/* next type on the swap list */
+ 	unsigned int	max;		/* extent of the swap_map */
+ 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
+ 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
+@@ -255,11 +256,6 @@ struct swap_info_struct {
+ 	struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
+ };
+ 
+-struct swap_list_t {
+-	int head;	/* head of priority-ordered swapfile list */
+-	int next;	/* swapfile to be used next */
+-};
+-
+ /* linux/mm/page_alloc.c */
+ extern unsigned long totalram_pages;
+ extern unsigned long totalreserve_pages;
+diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
+index e282624e8c10..388293a91e8c 100644
+--- a/include/linux/swapfile.h
++++ b/include/linux/swapfile.h
+@@ -6,7 +6,7 @@
+  * want to expose them to the dozens of source files that include swap.h
+  */
+ extern spinlock_t swap_lock;
+-extern struct swap_list_t swap_list;
++extern struct plist_head swap_active_head;
+ extern struct swap_info_struct *swap_info[];
+ extern int try_to_unuse(unsigned int, bool, unsigned long);
+ 
+diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h
+new file mode 100644
+index 000000000000..c3fa0fd43949
+--- /dev/null
++++ b/include/linux/vmacache.h
+@@ -0,0 +1,38 @@
++#ifndef __LINUX_VMACACHE_H
++#define __LINUX_VMACACHE_H
++
++#include <linux/sched.h>
++#include <linux/mm.h>
++
++/*
++ * Hash based on the page number. Provides a good hit rate for
++ * workloads with good locality and those with random accesses as well.
++ */
++#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK)
++
++static inline void vmacache_flush(struct task_struct *tsk)
++{
++	memset(tsk->vmacache, 0, sizeof(tsk->vmacache));
++}
++
++extern void vmacache_flush_all(struct mm_struct *mm);
++extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma);
++extern struct vm_area_struct *vmacache_find(struct mm_struct *mm,
++						    unsigned long addr);
++
++#ifndef CONFIG_MMU
++extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
++						  unsigned long start,
++						  unsigned long end);
++#endif
++
++static inline void vmacache_invalidate(struct mm_struct *mm)
++{
++	mm->vmacache_seqnum++;
++
++	/* deal with overflows */
++	if (unlikely(mm->vmacache_seqnum == 0))
++		vmacache_flush_all(mm);
++}
++
++#endif /* __LINUX_VMACACHE_H */
+diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
+index bef53ce555d2..b10682cb138c 100644
+--- a/include/media/videobuf2-core.h
++++ b/include/media/videobuf2-core.h
+@@ -329,6 +329,9 @@ struct v4l2_fh;
+  * @retry_start_streaming: start_streaming() was called, but there were not enough
+  *		buffers queued. If set, then retry calling start_streaming when
+  *		queuing a new buffer.
++ * @waiting_for_buffers: used in poll() to check if vb2 is still waiting for
++ *		buffers. Only set for capture queues if qbuf has not yet been
++ *		called since poll() needs to return POLLERR in that situation.
+  * @fileio:	file io emulator internal data, used only if emulator is active
+  */
+ struct vb2_queue {
+@@ -362,6 +365,7 @@ struct vb2_queue {
+ 
+ 	unsigned int			streaming:1;
+ 	unsigned int			retry_start_streaming:1;
++	unsigned int			waiting_for_buffers:1;
+ 
+ 	struct vb2_fileio_data		*fileio;
+ };
+diff --git a/init/Kconfig b/init/Kconfig
+index 93c5ef0c5210..8b9521a2d2c1 100644
+--- a/init/Kconfig
++++ b/init/Kconfig
+@@ -1389,6 +1389,7 @@ config FUTEX
+ 
+ config HAVE_FUTEX_CMPXCHG
+ 	bool
++	depends on FUTEX
+ 	help
+ 	  Architectures should select this if futex_atomic_cmpxchg_inatomic()
+ 	  is implemented and always working. This removes a couple of runtime
+diff --git a/kernel/cpuset.c b/kernel/cpuset.c
+index 6b27e5c0cd86..15b3ea693225 100644
+--- a/kernel/cpuset.c
++++ b/kernel/cpuset.c
+@@ -1022,7 +1022,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
+ 	task_lock(tsk);
+ 	/*
+ 	 * Determine if a loop is necessary if another thread is doing
+-	 * get_mems_allowed().  If at least one node remains unchanged and
++	 * read_mems_allowed_begin().  If at least one node remains unchanged and
+ 	 * tsk does not have a mempolicy, then an empty nodemask will not be
+ 	 * possible when mems_allowed is larger than a word.
+ 	 */
+diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
+index 334b3980ffc1..8865caec45fb 100644
+--- a/kernel/debug/debug_core.c
++++ b/kernel/debug/debug_core.c
+@@ -49,6 +49,7 @@
+ #include <linux/pid.h>
+ #include <linux/smp.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/rcupdate.h>
+ 
+ #include <asm/cacheflush.h>
+@@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
+ 	if (!CACHE_FLUSH_IS_SAFE)
+ 		return;
+ 
+-	if (current->mm && current->mm->mmap_cache) {
+-		flush_cache_range(current->mm->mmap_cache,
+-				  addr, addr + BREAK_INSTR_SIZE);
++	if (current->mm) {
++		int i;
++
++		for (i = 0; i < VMACACHE_SIZE; i++) {
++			if (!current->vmacache[i])
++				continue;
++			flush_cache_range(current->vmacache[i],
++					  addr, addr + BREAK_INSTR_SIZE);
++		}
+ 	}
++
+ 	/* Force flush instruction cache if it was outside the mm */
+ 	flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
+ }
+diff --git a/kernel/events/core.c b/kernel/events/core.c
+index 3a140ca37777..4ced342f1ba9 100644
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -7836,8 +7836,10 @@ int perf_event_init_task(struct task_struct *child)
+ 
+ 	for_each_task_context_nr(ctxn) {
+ 		ret = perf_event_init_context(child, ctxn);
+-		if (ret)
++		if (ret) {
++			perf_event_free_task(child);
+ 			return ret;
++		}
+ 	}
+ 
+ 	return 0;
+diff --git a/kernel/fork.c b/kernel/fork.c
+index c44bff8097f5..e2c685396295 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -28,6 +28,8 @@
+ #include <linux/mman.h>
+ #include <linux/mmu_notifier.h>
+ #include <linux/fs.h>
++#include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/nsproxy.h>
+ #include <linux/capability.h>
+ #include <linux/cpu.h>
+@@ -363,7 +365,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+ 
+ 	mm->locked_vm = 0;
+ 	mm->mmap = NULL;
+-	mm->mmap_cache = NULL;
++	mm->vmacache_seqnum = 0;
+ 	mm->map_count = 0;
+ 	cpumask_clear(mm_cpumask(mm));
+ 	mm->mm_rb = RB_ROOT;
+@@ -876,6 +878,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
+ 	if (!oldmm)
+ 		return 0;
+ 
++	/* initialize the new vmacache entries */
++	vmacache_flush(tsk);
++
+ 	if (clone_flags & CLONE_VM) {
+ 		atomic_inc(&oldmm->mm_users);
+ 		mm = oldmm;
+@@ -1323,7 +1328,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
+ 		goto bad_fork_cleanup_policy;
+ 	retval = audit_alloc(p);
+ 	if (retval)
+-		goto bad_fork_cleanup_policy;
++		goto bad_fork_cleanup_perf;
+ 	/* copy all the process information */
+ 	retval = copy_semundo(clone_flags, p);
+ 	if (retval)
+@@ -1522,8 +1527,9 @@ bad_fork_cleanup_semundo:
+ 	exit_sem(p);
+ bad_fork_cleanup_audit:
+ 	audit_free(p);
+-bad_fork_cleanup_policy:
++bad_fork_cleanup_perf:
+ 	perf_event_free_task(p);
++bad_fork_cleanup_policy:
+ #ifdef CONFIG_NUMA
+ 	mpol_put(p->mempolicy);
+ bad_fork_cleanup_cgroup:
+diff --git a/kernel/time.c b/kernel/time.c
+index 7c7964c33ae7..3c49ab45f822 100644
+--- a/kernel/time.c
++++ b/kernel/time.c
+@@ -496,17 +496,20 @@ EXPORT_SYMBOL(usecs_to_jiffies);
+  * that a remainder subtract here would not do the right thing as the
+  * resolution values don't fall on second boundries.  I.e. the line:
+  * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
++ * Note that due to the small error in the multiplier here, this
++ * rounding is incorrect for sufficiently large values of tv_nsec, but
++ * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're
++ * OK.
+  *
+  * Rather, we just shift the bits off the right.
+  *
+  * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
+  * value to a scaled second value.
+  */
+-unsigned long
+-timespec_to_jiffies(const struct timespec *value)
++static unsigned long
++__timespec_to_jiffies(unsigned long sec, long nsec)
+ {
+-	unsigned long sec = value->tv_sec;
+-	long nsec = value->tv_nsec + TICK_NSEC - 1;
++	nsec = nsec + TICK_NSEC - 1;
+ 
+ 	if (sec >= MAX_SEC_IN_JIFFIES){
+ 		sec = MAX_SEC_IN_JIFFIES;
+@@ -517,6 +520,13 @@ timespec_to_jiffies(const struct timespec *value)
+ 		 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
+ 
+ }
++
++unsigned long
++timespec_to_jiffies(const struct timespec *value)
++{
++	return __timespec_to_jiffies(value->tv_sec, value->tv_nsec);
++}
++
+ EXPORT_SYMBOL(timespec_to_jiffies);
+ 
+ void
+@@ -533,31 +543,27 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
+ }
+ EXPORT_SYMBOL(jiffies_to_timespec);
+ 
+-/* Same for "timeval"
++/*
++ * We could use a similar algorithm to timespec_to_jiffies (with a
++ * different multiplier for usec instead of nsec). But this has a
++ * problem with rounding: we can't exactly add TICK_NSEC - 1 to the
++ * usec value, since it's not necessarily integral.
+  *
+- * Well, almost.  The problem here is that the real system resolution is
+- * in nanoseconds and the value being converted is in micro seconds.
+- * Also for some machines (those that use HZ = 1024, in-particular),
+- * there is a LARGE error in the tick size in microseconds.
+-
+- * The solution we use is to do the rounding AFTER we convert the
+- * microsecond part.  Thus the USEC_ROUND, the bits to be shifted off.
+- * Instruction wise, this should cost only an additional add with carry
+- * instruction above the way it was done above.
++ * We could instead round in the intermediate scaled representation
++ * (i.e. in units of 1/2^(large scale) jiffies) but that's also
++ * perilous: the scaling introduces a small positive error, which
++ * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1
++ * units to the intermediate before shifting) leads to accidental
++ * overflow and overestimates.
++ *
++ * At the cost of one additional multiplication by a constant, just
++ * use the timespec implementation.
+  */
+ unsigned long
+ timeval_to_jiffies(const struct timeval *value)
+ {
+-	unsigned long sec = value->tv_sec;
+-	long usec = value->tv_usec;
+-
+-	if (sec >= MAX_SEC_IN_JIFFIES){
+-		sec = MAX_SEC_IN_JIFFIES;
+-		usec = 0;
+-	}
+-	return (((u64)sec * SEC_CONVERSION) +
+-		(((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
+-		 (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
++	return __timespec_to_jiffies(value->tv_sec,
++				     value->tv_usec * NSEC_PER_USEC);
+ }
+ EXPORT_SYMBOL(timeval_to_jiffies);
+ 
+diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
+index 773aba836e81..774a0807fe81 100644
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -3372,7 +3372,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
+ 	iter->head = cpu_buffer->reader_page->read;
+ 
+ 	iter->cache_reader_page = iter->head_page;
+-	iter->cache_read = iter->head;
++	iter->cache_read = cpu_buffer->read;
+ 
+ 	if (iter->head)
+ 		iter->read_stamp = cpu_buffer->read_stamp;
+diff --git a/lib/plist.c b/lib/plist.c
+index 1ebc95f7a46f..0f2084d30798 100644
+--- a/lib/plist.c
++++ b/lib/plist.c
+@@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, struct plist_head *head)
+ 	plist_check_head(head);
+ }
+ 
++/**
++ * plist_requeue - Requeue @node at end of same-prio entries.
++ *
++ * This is essentially an optimized plist_del() followed by
++ * plist_add().  It moves an entry already in the plist to
++ * after any other same-priority entries.
++ *
++ * @node:	&struct plist_node pointer - entry to be moved
++ * @head:	&struct plist_head pointer - list head
++ */
++void plist_requeue(struct plist_node *node, struct plist_head *head)
++{
++	struct plist_node *iter;
++	struct list_head *node_next = &head->node_list;
++
++	plist_check_head(head);
++	BUG_ON(plist_head_empty(head));
++	BUG_ON(plist_node_empty(node));
++
++	if (node == plist_last(head))
++		return;
++
++	iter = plist_next(node);
++
++	if (node->prio != iter->prio)
++		return;
++
++	plist_del(node, head);
++
++	plist_for_each_continue(iter, head) {
++		if (node->prio != iter->prio) {
++			node_next = &iter->node_list;
++			break;
++		}
++	}
++	list_add_tail(&node->node_list, node_next);
++
++	plist_check_head(head);
++}
++
+ #ifdef CONFIG_DEBUG_PI_LIST
+ #include <linux/sched.h>
+ #include <linux/module.h>
+@@ -170,6 +210,14 @@ static void __init plist_test_check(int nr_expect)
+ 	BUG_ON(prio_pos->prio_list.next != &first->prio_list);
+ }
+ 
++static void __init plist_test_requeue(struct plist_node *node)
++{
++	plist_requeue(node, &test_head);
++
++	if (node != plist_last(&test_head))
++		BUG_ON(node->prio == plist_next(node)->prio);
++}
++
+ static int  __init plist_test(void)
+ {
+ 	int nr_expect = 0, i, loop;
+@@ -193,6 +241,10 @@ static int  __init plist_test(void)
+ 			nr_expect--;
+ 		}
+ 		plist_test_check(nr_expect);
++		if (!plist_node_empty(test_node + i)) {
++			plist_test_requeue(test_node + i);
++			plist_test_check(nr_expect);
++		}
+ 	}
+ 
+ 	for (i = 0; i < ARRAY_SIZE(test_node); i++) {
+diff --git a/mm/Makefile b/mm/Makefile
+index 310c90a09264..c561f1f6bca0 100644
+--- a/mm/Makefile
++++ b/mm/Makefile
+@@ -16,7 +16,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
+ 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
+ 			   util.o mmzone.o vmstat.o backing-dev.o \
+ 			   mm_init.o mmu_context.o percpu.o slab_common.o \
+-			   compaction.o balloon_compaction.o \
++			   compaction.o balloon_compaction.o vmacache.o \
+ 			   interval_tree.o list_lru.o $(mmu-y)
+ 
+ obj-y += init-mm.o
+diff --git a/mm/compaction.c b/mm/compaction.c
+index 5f702ef0a65f..5e38e5706f62 100644
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -217,21 +217,12 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
+ /* Returns true if the page is within a block suitable for migration to */
+ static bool suitable_migration_target(struct page *page)
+ {
+-	int migratetype = get_pageblock_migratetype(page);
+-
+-	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
+-	if (migratetype == MIGRATE_RESERVE)
+-		return false;
+-
+-	if (is_migrate_isolate(migratetype))
+-		return false;
+-
+-	/* If the page is a large free page, then allow migration */
++	/* If the page is a large free page, then disallow migration */
+ 	if (PageBuddy(page) && page_order(page) >= pageblock_order)
+-		return true;
++		return false;
+ 
+ 	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
+-	if (migrate_async_suitable(migratetype))
++	if (migrate_async_suitable(get_pageblock_migratetype(page)))
+ 		return true;
+ 
+ 	/* Otherwise skip the block */
+@@ -253,6 +244,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
+ 	struct page *cursor, *valid_page = NULL;
+ 	unsigned long flags;
+ 	bool locked = false;
++	bool checked_pageblock = false;
+ 
+ 	cursor = pfn_to_page(blockpfn);
+ 
+@@ -284,8 +276,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
+ 			break;
+ 
+ 		/* Recheck this is a suitable migration target under lock */
+-		if (!strict && !suitable_migration_target(page))
+-			break;
++		if (!strict && !checked_pageblock) {
++			/*
++			 * We need to check suitability of pageblock only once
++			 * and this isolate_freepages_block() is called with
++			 * pageblock range, so just check once is sufficient.
++			 */
++			checked_pageblock = true;
++			if (!suitable_migration_target(page))
++				break;
++		}
+ 
+ 		/* Recheck this is a buddy page under lock */
+ 		if (!PageBuddy(page))
+@@ -460,12 +460,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 	unsigned long last_pageblock_nr = 0, pageblock_nr;
+ 	unsigned long nr_scanned = 0, nr_isolated = 0;
+ 	struct list_head *migratelist = &cc->migratepages;
+-	isolate_mode_t mode = 0;
+ 	struct lruvec *lruvec;
+ 	unsigned long flags;
+ 	bool locked = false;
+ 	struct page *page = NULL, *valid_page = NULL;
+ 	bool skipped_async_unsuitable = false;
++	const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
++				    (unevictable ? ISOLATE_UNEVICTABLE : 0);
+ 
+ 	/*
+ 	 * Ensure that there are not too many pages isolated from the LRU
+@@ -487,7 +488,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 	cond_resched();
+ 	for (; low_pfn < end_pfn; low_pfn++) {
+ 		/* give a chance to irqs before checking need_resched() */
+-		if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
++		if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
+ 			if (should_release_lock(&zone->lru_lock)) {
+ 				spin_unlock_irqrestore(&zone->lru_lock, flags);
+ 				locked = false;
+@@ -526,8 +527,25 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 
+ 		/* If isolation recently failed, do not retry */
+ 		pageblock_nr = low_pfn >> pageblock_order;
+-		if (!isolation_suitable(cc, page))
+-			goto next_pageblock;
++		if (last_pageblock_nr != pageblock_nr) {
++			int mt;
++
++			last_pageblock_nr = pageblock_nr;
++			if (!isolation_suitable(cc, page))
++				goto next_pageblock;
++
++			/*
++			 * For async migration, also only scan in MOVABLE
++			 * blocks. Async migration is optimistic to see if
++			 * the minimum amount of work satisfies the allocation
++			 */
++			mt = get_pageblock_migratetype(page);
++			if (!cc->sync && !migrate_async_suitable(mt)) {
++				cc->finished_update_migrate = true;
++				skipped_async_unsuitable = true;
++				goto next_pageblock;
++			}
++		}
+ 
+ 		/*
+ 		 * Skip if free. page_order cannot be used without zone->lock
+@@ -537,18 +555,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 			continue;
+ 
+ 		/*
+-		 * For async migration, also only scan in MOVABLE blocks. Async
+-		 * migration is optimistic to see if the minimum amount of work
+-		 * satisfies the allocation
+-		 */
+-		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+-		    !migrate_async_suitable(get_pageblock_migratetype(page))) {
+-			cc->finished_update_migrate = true;
+-			skipped_async_unsuitable = true;
+-			goto next_pageblock;
+-		}
+-
+-		/*
+ 		 * Check may be lockless but that's ok as we recheck later.
+ 		 * It's possible to migrate LRU pages and balloon pages
+ 		 * Skip any other type of page
+@@ -557,11 +563,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 			if (unlikely(balloon_page_movable(page))) {
+ 				if (locked && balloon_page_isolate(page)) {
+ 					/* Successfully isolated */
+-					cc->finished_update_migrate = true;
+-					list_add(&page->lru, migratelist);
+-					cc->nr_migratepages++;
+-					nr_isolated++;
+-					goto check_compact_cluster;
++					goto isolate_success;
+ 				}
+ 			}
+ 			continue;
+@@ -584,6 +586,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 			continue;
+ 		}
+ 
++		/*
++		 * Migration will fail if an anonymous page is pinned in memory,
++		 * so avoid taking lru_lock and isolating it unnecessarily in an
++		 * admittedly racy check.
++		 */
++		if (!page_mapping(page) &&
++		    page_count(page) > page_mapcount(page))
++			continue;
++
+ 		/* Check if it is ok to still hold the lock */
+ 		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
+ 								locked, cc);
+@@ -598,12 +609,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 			continue;
+ 		}
+ 
+-		if (!cc->sync)
+-			mode |= ISOLATE_ASYNC_MIGRATE;
+-
+-		if (unevictable)
+-			mode |= ISOLATE_UNEVICTABLE;
+-
+ 		lruvec = mem_cgroup_page_lruvec(page, zone);
+ 
+ 		/* Try isolate the page */
+@@ -613,13 +618,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ 		VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ 
+ 		/* Successfully isolated */
+-		cc->finished_update_migrate = true;
+ 		del_page_from_lru_list(page, lruvec, page_lru(page));
++
++isolate_success:
++		cc->finished_update_migrate = true;
+ 		list_add(&page->lru, migratelist);
+ 		cc->nr_migratepages++;
+ 		nr_isolated++;
+ 
+-check_compact_cluster:
+ 		/* Avoid isolating too much */
+ 		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
+ 			++low_pfn;
+@@ -630,7 +636,6 @@ check_compact_cluster:
+ 
+ next_pageblock:
+ 		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
+-		last_pageblock_nr = pageblock_nr;
+ 	}
+ 
+ 	acct_isolated(zone, locked, cc);
+@@ -1188,6 +1193,7 @@ static void compact_node(int nid)
+ 	struct compact_control cc = {
+ 		.order = -1,
+ 		.sync = true,
++		.ignore_skip_hint = true,
+ 	};
+ 
+ 	__compact_pgdat(NODE_DATA(nid), &cc);
+diff --git a/mm/filemap.c b/mm/filemap.c
+index 7a13f6ac5421..c2cc7c95eff1 100644
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -192,9 +192,11 @@ static int filemap_check_errors(struct address_space *mapping)
+ {
+ 	int ret = 0;
+ 	/* Check for outstanding write errors */
+-	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
++	if (test_bit(AS_ENOSPC, &mapping->flags) &&
++	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+ 		ret = -ENOSPC;
+-	if (test_and_clear_bit(AS_EIO, &mapping->flags))
++	if (test_bit(AS_EIO, &mapping->flags) &&
++	    test_and_clear_bit(AS_EIO, &mapping->flags))
+ 		ret = -EIO;
+ 	return ret;
+ }
+@@ -520,10 +522,10 @@ struct page *__page_cache_alloc(gfp_t gfp)
+ 	if (cpuset_do_page_mem_spread()) {
+ 		unsigned int cpuset_mems_cookie;
+ 		do {
+-			cpuset_mems_cookie = get_mems_allowed();
++			cpuset_mems_cookie = read_mems_allowed_begin();
+ 			n = cpuset_mem_spread_node();
+ 			page = alloc_pages_exact_node(n, gfp, 0);
+-		} while (!put_mems_allowed(cpuset_mems_cookie) && !page);
++		} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
+ 
+ 		return page;
+ 	}
+diff --git a/mm/frontswap.c b/mm/frontswap.c
+index 1b24bdcb3197..c30eec536f03 100644
+--- a/mm/frontswap.c
++++ b/mm/frontswap.c
+@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area);
+ 
+ static unsigned long __frontswap_curr_pages(void)
+ {
+-	int type;
+ 	unsigned long totalpages = 0;
+ 	struct swap_info_struct *si = NULL;
+ 
+ 	assert_spin_locked(&swap_lock);
+-	for (type = swap_list.head; type >= 0; type = si->next) {
+-		si = swap_info[type];
++	plist_for_each_entry(si, &swap_active_head, list)
+ 		totalpages += atomic_read(&si->frontswap_pages);
+-	}
+ 	return totalpages;
+ }
+ 
+@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
+ 	int si_frontswap_pages;
+ 	unsigned long total_pages_to_unuse = total;
+ 	unsigned long pages = 0, pages_to_unuse = 0;
+-	int type;
+ 
+ 	assert_spin_locked(&swap_lock);
+-	for (type = swap_list.head; type >= 0; type = si->next) {
+-		si = swap_info[type];
++	plist_for_each_entry(si, &swap_active_head, list) {
+ 		si_frontswap_pages = atomic_read(&si->frontswap_pages);
+ 		if (total_pages_to_unuse < si_frontswap_pages) {
+ 			pages = pages_to_unuse = total_pages_to_unuse;
+@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
+ 		}
+ 		vm_unacct_memory(pages);
+ 		*unused = pages_to_unuse;
+-		*swapid = type;
++		*swapid = si->type;
+ 		ret = 0;
+ 		break;
+ 	}
+@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
+ 	/*
+ 	 * we don't want to hold swap_lock while doing a very
+ 	 * lengthy try_to_unuse, but swap_list may change
+-	 * so restart scan from swap_list.head each time
++	 * so restart scan from swap_active_head each time
+ 	 */
+ 	spin_lock(&swap_lock);
+ 	ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index 1c42d0c36d0b..718bfa16a36f 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1819,21 +1819,24 @@ static int __split_huge_page_map(struct page *page,
+ 	if (pmd) {
+ 		pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ 		pmd_populate(mm, &_pmd, pgtable);
++		if (pmd_write(*pmd))
++			BUG_ON(page_mapcount(page) != 1);
+ 
+ 		haddr = address;
+ 		for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ 			pte_t *pte, entry;
+ 			BUG_ON(PageCompound(page+i));
++			/*
++			 * Note that pmd_numa is not transferred deliberately
++			 * to avoid any possibility that pte_numa leaks to
++			 * a PROT_NONE VMA by accident.
++			 */
+ 			entry = mk_pte(page + i, vma->vm_page_prot);
+ 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ 			if (!pmd_write(*pmd))
+ 				entry = pte_wrprotect(entry);
+-			else
+-				BUG_ON(page_mapcount(page) != 1);
+ 			if (!pmd_young(*pmd))
+ 				entry = pte_mkold(entry);
+-			if (pmd_numa(*pmd))
+-				entry = pte_mknuma(entry);
+ 			pte = pte_offset_map(&_pmd, haddr);
+ 			BUG_ON(!pte_none(*pte));
+ 			set_pte_at(mm, haddr, pte, entry);
+diff --git a/mm/hugetlb.c b/mm/hugetlb.c
+index 923f38e62bcf..67d0c175efcf 100644
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -540,7 +540,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
+ 		goto err;
+ 
+ retry_cpuset:
+-	cpuset_mems_cookie = get_mems_allowed();
++	cpuset_mems_cookie = read_mems_allowed_begin();
+ 	zonelist = huge_zonelist(vma, address,
+ 					htlb_alloc_mask(h), &mpol, &nodemask);
+ 
+@@ -562,7 +562,7 @@ retry_cpuset:
+ 	}
+ 
+ 	mpol_cond_put(mpol);
+-	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ 		goto retry_cpuset;
+ 	return page;
+ 
+@@ -2071,6 +2071,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
+ 	unsigned long tmp;
+ 	int ret;
+ 
++	if (!hugepages_supported())
++		return -ENOTSUPP;
++
+ 	tmp = h->max_huge_pages;
+ 
+ 	if (write && h->order >= MAX_ORDER)
+@@ -2124,6 +2127,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
+ 	unsigned long tmp;
+ 	int ret;
+ 
++	if (!hugepages_supported())
++		return -ENOTSUPP;
++
+ 	tmp = h->nr_overcommit_huge_pages;
+ 
+ 	if (write && h->order >= MAX_ORDER)
+@@ -2149,6 +2155,8 @@ out:
+ void hugetlb_report_meminfo(struct seq_file *m)
+ {
+ 	struct hstate *h = &default_hstate;
++	if (!hugepages_supported())
++		return;
+ 	seq_printf(m,
+ 			"HugePages_Total:   %5lu\n"
+ 			"HugePages_Free:    %5lu\n"
+@@ -2165,6 +2173,8 @@ void hugetlb_report_meminfo(struct seq_file *m)
+ int hugetlb_report_node_meminfo(int nid, char *buf)
+ {
+ 	struct hstate *h = &default_hstate;
++	if (!hugepages_supported())
++		return 0;
+ 	return sprintf(buf,
+ 		"Node %d HugePages_Total: %5u\n"
+ 		"Node %d HugePages_Free:  %5u\n"
+@@ -2179,6 +2189,9 @@ void hugetlb_show_meminfo(void)
+ 	struct hstate *h;
+ 	int nid;
+ 
++	if (!hugepages_supported())
++		return;
++
+ 	for_each_node_state(nid, N_MEMORY)
+ 		for_each_hstate(h)
+ 			pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+diff --git a/mm/mempolicy.c b/mm/mempolicy.c
+index 15a8ea031526..796c7e6cf93b 100644
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -1897,7 +1897,7 @@ int node_random(const nodemask_t *maskp)
+  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
+  * @nodemask for filtering the zonelist.
+  *
+- * Must be protected by get_mems_allowed()
++ * Must be protected by read_mems_allowed_begin()
+  */
+ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
+ 				gfp_t gfp_flags, struct mempolicy **mpol,
+@@ -2061,7 +2061,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
+ 
+ retry_cpuset:
+ 	pol = get_vma_policy(current, vma, addr);
+-	cpuset_mems_cookie = get_mems_allowed();
++	cpuset_mems_cookie = read_mems_allowed_begin();
+ 
+ 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
+ 		unsigned nid;
+@@ -2069,7 +2069,7 @@ retry_cpuset:
+ 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
+ 		mpol_cond_put(pol);
+ 		page = alloc_page_interleave(gfp, order, nid);
+-		if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++		if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ 			goto retry_cpuset;
+ 
+ 		return page;
+@@ -2079,7 +2079,7 @@ retry_cpuset:
+ 				      policy_nodemask(gfp, pol));
+ 	if (unlikely(mpol_needs_cond_ref(pol)))
+ 		__mpol_put(pol);
+-	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ 		goto retry_cpuset;
+ 	return page;
+ }
+@@ -2113,7 +2113,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
+ 		pol = &default_policy;
+ 
+ retry_cpuset:
+-	cpuset_mems_cookie = get_mems_allowed();
++	cpuset_mems_cookie = read_mems_allowed_begin();
+ 
+ 	/*
+ 	 * No reference counting needed for current->mempolicy
+@@ -2126,7 +2126,7 @@ retry_cpuset:
+ 				policy_zonelist(gfp, pol, numa_node_id()),
+ 				policy_nodemask(gfp, pol));
+ 
+-	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ 		goto retry_cpuset;
+ 
+ 	return page;
+diff --git a/mm/migrate.c b/mm/migrate.c
+index bed48809e5d0..13f47fbe3550 100644
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -148,8 +148,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
+ 	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
+ 	if (pte_swp_soft_dirty(*ptep))
+ 		pte = pte_mksoft_dirty(pte);
++
++	/* Recheck VMA as permissions can change since migration started  */
+ 	if (is_write_migration_entry(entry))
+-		pte = pte_mkwrite(pte);
++		pte = maybe_mkwrite(pte, vma);
++
+ #ifdef CONFIG_HUGETLB_PAGE
+ 	if (PageHuge(new)) {
+ 		pte = pte_mkhuge(pte);
+diff --git a/mm/mmap.c b/mm/mmap.c
+index 20ff0c33274c..dfe90657a6db 100644
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -10,6 +10,7 @@
+ #include <linux/slab.h>
+ #include <linux/backing-dev.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/shm.h>
+ #include <linux/mman.h>
+ #include <linux/pagemap.h>
+@@ -681,8 +682,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	prev->vm_next = next = vma->vm_next;
+ 	if (next)
+ 		next->vm_prev = prev;
+-	if (mm->mmap_cache == vma)
+-		mm->mmap_cache = prev;
++
++	/* Kill the cache */
++	vmacache_invalidate(mm);
+ }
+ 
+ /*
+@@ -1989,34 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area);
+ /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
+ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+ {
+-	struct vm_area_struct *vma = NULL;
++	struct rb_node *rb_node;
++	struct vm_area_struct *vma;
+ 
+ 	/* Check the cache first. */
+-	/* (Cache hit rate is typically around 35%.) */
+-	vma = ACCESS_ONCE(mm->mmap_cache);
+-	if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+-		struct rb_node *rb_node;
++	vma = vmacache_find(mm, addr);
++	if (likely(vma))
++		return vma;
+ 
+-		rb_node = mm->mm_rb.rb_node;
+-		vma = NULL;
++	rb_node = mm->mm_rb.rb_node;
++	vma = NULL;
+ 
+-		while (rb_node) {
+-			struct vm_area_struct *vma_tmp;
+-
+-			vma_tmp = rb_entry(rb_node,
+-					   struct vm_area_struct, vm_rb);
+-
+-			if (vma_tmp->vm_end > addr) {
+-				vma = vma_tmp;
+-				if (vma_tmp->vm_start <= addr)
+-					break;
+-				rb_node = rb_node->rb_left;
+-			} else
+-				rb_node = rb_node->rb_right;
+-		}
+-		if (vma)
+-			mm->mmap_cache = vma;
++	while (rb_node) {
++		struct vm_area_struct *tmp;
++
++		tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
++
++		if (tmp->vm_end > addr) {
++			vma = tmp;
++			if (tmp->vm_start <= addr)
++				break;
++			rb_node = rb_node->rb_left;
++		} else
++			rb_node = rb_node->rb_right;
+ 	}
++
++	if (vma)
++		vmacache_update(addr, vma);
+ 	return vma;
+ }
+ 
+@@ -2388,7 +2389,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	} else
+ 		mm->highest_vm_end = prev ? prev->vm_end : 0;
+ 	tail_vma->vm_next = NULL;
+-	mm->mmap_cache = NULL;		/* Kill the cache. */
++
++	/* Kill the cache */
++	vmacache_invalidate(mm);
+ }
+ 
+ /*
+diff --git a/mm/nommu.c b/mm/nommu.c
+index 8740213b1647..3ee4f74fbfbe 100644
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -15,6 +15,7 @@
+ 
+ #include <linux/export.h>
+ #include <linux/mm.h>
++#include <linux/vmacache.h>
+ #include <linux/mman.h>
+ #include <linux/swap.h>
+ #include <linux/file.h>
+@@ -768,16 +769,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
+  */
+ static void delete_vma_from_mm(struct vm_area_struct *vma)
+ {
++	int i;
+ 	struct address_space *mapping;
+ 	struct mm_struct *mm = vma->vm_mm;
++	struct task_struct *curr = current;
+ 
+ 	kenter("%p", vma);
+ 
+ 	protect_vma(vma, 0);
+ 
+ 	mm->map_count--;
+-	if (mm->mmap_cache == vma)
+-		mm->mmap_cache = NULL;
++	for (i = 0; i < VMACACHE_SIZE; i++) {
++		/* if the vma is cached, invalidate the entire cache */
++		if (curr->vmacache[i] == vma) {
++			vmacache_invalidate(curr->mm);
++			break;
++		}
++	}
+ 
+ 	/* remove the VMA from the mapping */
+ 	if (vma->vm_file) {
+@@ -825,8 +833,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+ 	struct vm_area_struct *vma;
+ 
+ 	/* check the cache first */
+-	vma = ACCESS_ONCE(mm->mmap_cache);
+-	if (vma && vma->vm_start <= addr && vma->vm_end > addr)
++	vma = vmacache_find(mm, addr);
++	if (likely(vma))
+ 		return vma;
+ 
+ 	/* trawl the list (there may be multiple mappings in which addr
+@@ -835,7 +843,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+ 		if (vma->vm_start > addr)
+ 			return NULL;
+ 		if (vma->vm_end > addr) {
+-			mm->mmap_cache = vma;
++			vmacache_update(addr, vma);
+ 			return vma;
+ 		}
+ 	}
+@@ -874,8 +882,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
+ 	unsigned long end = addr + len;
+ 
+ 	/* check the cache first */
+-	vma = mm->mmap_cache;
+-	if (vma && vma->vm_start == addr && vma->vm_end == end)
++	vma = vmacache_find_exact(mm, addr, end);
++	if (vma)
+ 		return vma;
+ 
+ 	/* trawl the list (there may be multiple mappings in which addr
+@@ -886,7 +894,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
+ 		if (vma->vm_start > addr)
+ 			return NULL;
+ 		if (vma->vm_end == end) {
+-			mm->mmap_cache = vma;
++			vmacache_update(addr, vma);
+ 			return vma;
+ 		}
+ 	}
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index 62e400d00e3f..ff0f6b13f32f 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1869,7 +1869,7 @@ static void __paginginit init_zone_allows_reclaim(int nid)
+ {
+ 	int i;
+ 
+-	for_each_online_node(i)
++	for_each_node_state(i, N_MEMORY)
+ 		if (node_distance(nid, i) <= RECLAIM_DISTANCE)
+ 			node_set(i, NODE_DATA(nid)->reclaim_nodes);
+ 		else
+@@ -2736,7 +2736,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+ 		return NULL;
+ 
+ retry_cpuset:
+-	cpuset_mems_cookie = get_mems_allowed();
++	cpuset_mems_cookie = read_mems_allowed_begin();
+ 
+ 	/* The preferred zone is used for statistics later */
+ 	first_zones_zonelist(zonelist, high_zoneidx,
+@@ -2791,7 +2791,7 @@ out:
+ 	 * the mask is being updated. If a page allocation is about to fail,
+ 	 * check if the cpuset changed during allocation and if so, retry.
+ 	 */
+-	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ 		goto retry_cpuset;
+ 
+ 	memcg_kmem_commit_charge(page, memcg, order);
+@@ -3059,9 +3059,9 @@ bool skip_free_areas_node(unsigned int flags, int nid)
+ 		goto out;
+ 
+ 	do {
+-		cpuset_mems_cookie = get_mems_allowed();
++		cpuset_mems_cookie = read_mems_allowed_begin();
+ 		ret = !node_isset(nid, cpuset_current_mems_allowed);
+-	} while (!put_mems_allowed(cpuset_mems_cookie));
++	} while (read_mems_allowed_retry(cpuset_mems_cookie));
+ out:
+ 	return ret;
+ }
+@@ -4933,7 +4933,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
+ 
+ 	pgdat->node_id = nid;
+ 	pgdat->node_start_pfn = node_start_pfn;
+-	init_zone_allows_reclaim(nid);
++	if (node_state(nid, N_MEMORY))
++		init_zone_allows_reclaim(nid);
+ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ 	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+ #endif
+diff --git a/mm/readahead.c b/mm/readahead.c
+index 0de2360d65f3..1fa0d6fca556 100644
+--- a/mm/readahead.c
++++ b/mm/readahead.c
+@@ -233,14 +233,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
+ 	return 0;
+ }
+ 
++#define MAX_READAHEAD   ((512*4096)/PAGE_CACHE_SIZE)
+ /*
+  * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
+  * sensible upper limit.
+  */
+ unsigned long max_sane_readahead(unsigned long nr)
+ {
+-	return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
+-		+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
++	return min(nr, MAX_READAHEAD);
+ }
+ 
+ /*
+diff --git a/mm/slab.c b/mm/slab.c
+index ea854eb2388c..0b1c2a58559d 100644
+--- a/mm/slab.c
++++ b/mm/slab.c
+@@ -3122,7 +3122,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
+ 	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
+ 
+ retry_cpuset:
+-	cpuset_mems_cookie = get_mems_allowed();
++	cpuset_mems_cookie = read_mems_allowed_begin();
+ 	zonelist = node_zonelist(slab_node(), flags);
+ 
+ retry:
+@@ -3180,7 +3180,7 @@ retry:
+ 		}
+ 	}
+ 
+-	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
++	if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
+ 		goto retry_cpuset;
+ 	return obj;
+ }
+diff --git a/mm/slub.c b/mm/slub.c
+index 25f14ad8f817..7611f148ee81 100644
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -1684,7 +1684,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
+ 		return NULL;
+ 
+ 	do {
+-		cpuset_mems_cookie = get_mems_allowed();
++		cpuset_mems_cookie = read_mems_allowed_begin();
+ 		zonelist = node_zonelist(slab_node(), flags);
+ 		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ 			struct kmem_cache_node *n;
+@@ -1696,19 +1696,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
+ 				object = get_partial_node(s, n, c, flags);
+ 				if (object) {
+ 					/*
+-					 * Return the object even if
+-					 * put_mems_allowed indicated that
+-					 * the cpuset mems_allowed was
+-					 * updated in parallel. It's a
+-					 * harmless race between the alloc
+-					 * and the cpuset update.
++					 * Don't check read_mems_allowed_retry()
++					 * here - if mems_allowed was updated in
++					 * parallel, that was a harmless race
++					 * between allocation and the cpuset
++					 * update
+ 					 */
+-					put_mems_allowed(cpuset_mems_cookie);
+ 					return object;
+ 				}
+ 			}
+ 		}
+-	} while (!put_mems_allowed(cpuset_mems_cookie));
++	} while (read_mems_allowed_retry(cpuset_mems_cookie));
+ #endif
+ 	return NULL;
+ }
+diff --git a/mm/swapfile.c b/mm/swapfile.c
+index 4a7f7e6992b6..beeeef8a1b2d 100644
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages;
+ /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
+ long total_swap_pages;
+ static int least_priority;
+-static atomic_t highest_priority_index = ATOMIC_INIT(-1);
+ 
+ static const char Bad_file[] = "Bad swap file entry ";
+ static const char Unused_file[] = "Unused swap file entry ";
+ static const char Bad_offset[] = "Bad swap offset entry ";
+ static const char Unused_offset[] = "Unused swap offset entry ";
+ 
+-struct swap_list_t swap_list = {-1, -1};
++/*
++ * all active swap_info_structs
++ * protected with swap_lock, and ordered by priority.
++ */
++PLIST_HEAD(swap_active_head);
++
++/*
++ * all available (active, not full) swap_info_structs
++ * protected with swap_avail_lock, ordered by priority.
++ * This is used by get_swap_page() instead of swap_active_head
++ * because swap_active_head includes all swap_info_structs,
++ * but get_swap_page() doesn't need to look at full ones.
++ * This uses its own lock instead of swap_lock because when a
++ * swap_info_struct changes between not-full/full, it needs to
++ * add/remove itself to/from this list, but the swap_info_struct->lock
++ * is held and the locking order requires swap_lock to be taken
++ * before any swap_info_struct->lock.
++ */
++static PLIST_HEAD(swap_avail_head);
++static DEFINE_SPINLOCK(swap_avail_lock);
+ 
+ struct swap_info_struct *swap_info[MAX_SWAPFILES];
+ 
+@@ -591,6 +609,9 @@ checks:
+ 	if (si->inuse_pages == si->pages) {
+ 		si->lowest_bit = si->max;
+ 		si->highest_bit = 0;
++		spin_lock(&swap_avail_lock);
++		plist_del(&si->avail_list, &swap_avail_head);
++		spin_unlock(&swap_avail_lock);
+ 	}
+ 	si->swap_map[offset] = usage;
+ 	inc_cluster_info_page(si, si->cluster_info, offset);
+@@ -640,71 +661,65 @@ no_page:
+ 
+ swp_entry_t get_swap_page(void)
+ {
+-	struct swap_info_struct *si;
++	struct swap_info_struct *si, *next;
+ 	pgoff_t offset;
+-	int type, next;
+-	int wrapped = 0;
+-	int hp_index;
+ 
+-	spin_lock(&swap_lock);
+ 	if (atomic_long_read(&nr_swap_pages) <= 0)
+ 		goto noswap;
+ 	atomic_long_dec(&nr_swap_pages);
+ 
+-	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
+-		hp_index = atomic_xchg(&highest_priority_index, -1);
+-		/*
+-		 * highest_priority_index records current highest priority swap
+-		 * type which just frees swap entries. If its priority is
+-		 * higher than that of swap_list.next swap type, we use it.  It
+-		 * isn't protected by swap_lock, so it can be an invalid value
+-		 * if the corresponding swap type is swapoff. We double check
+-		 * the flags here. It's even possible the swap type is swapoff
+-		 * and swapon again and its priority is changed. In such rare
+-		 * case, low prority swap type might be used, but eventually
+-		 * high priority swap will be used after several rounds of
+-		 * swap.
+-		 */
+-		if (hp_index != -1 && hp_index != type &&
+-		    swap_info[type]->prio < swap_info[hp_index]->prio &&
+-		    (swap_info[hp_index]->flags & SWP_WRITEOK)) {
+-			type = hp_index;
+-			swap_list.next = type;
+-		}
+-
+-		si = swap_info[type];
+-		next = si->next;
+-		if (next < 0 ||
+-		    (!wrapped && si->prio != swap_info[next]->prio)) {
+-			next = swap_list.head;
+-			wrapped++;
+-		}
++	spin_lock(&swap_avail_lock);
+ 
++start_over:
++	plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
++		/* requeue si to after same-priority siblings */
++		plist_requeue(&si->avail_list, &swap_avail_head);
++		spin_unlock(&swap_avail_lock);
+ 		spin_lock(&si->lock);
+-		if (!si->highest_bit) {
++		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
++			spin_lock(&swap_avail_lock);
++			if (plist_node_empty(&si->avail_list)) {
++				spin_unlock(&si->lock);
++				goto nextsi;
++			}
++			WARN(!si->highest_bit,
++			     "swap_info %d in list but !highest_bit\n",
++			     si->type);
++			WARN(!(si->flags & SWP_WRITEOK),
++			     "swap_info %d in list but !SWP_WRITEOK\n",
++			     si->type);
++			plist_del(&si->avail_list, &swap_avail_head);
+ 			spin_unlock(&si->lock);
+-			continue;
++			goto nextsi;
+ 		}
+-		if (!(si->flags & SWP_WRITEOK)) {
+-			spin_unlock(&si->lock);
+-			continue;
+-		}
+-
+-		swap_list.next = next;
+ 
+-		spin_unlock(&swap_lock);
+ 		/* This is called for allocating swap entry for cache */
+ 		offset = scan_swap_map(si, SWAP_HAS_CACHE);
+ 		spin_unlock(&si->lock);
+ 		if (offset)
+-			return swp_entry(type, offset);
+-		spin_lock(&swap_lock);
+-		next = swap_list.next;
++			return swp_entry(si->type, offset);
++		pr_debug("scan_swap_map of si %d failed to find offset\n",
++		       si->type);
++		spin_lock(&swap_avail_lock);
++nextsi:
++		/*
++		 * if we got here, it's likely that si was almost full before,
++		 * and since scan_swap_map() can drop the si->lock, multiple
++		 * callers probably all tried to get a page from the same si
++		 * and it filled up before we could get one; or, the si filled
++		 * up between us dropping swap_avail_lock and taking si->lock.
++		 * Since we dropped the swap_avail_lock, the swap_avail_head
++		 * list may have been modified; so if next is still in the
++		 * swap_avail_head list then try it, otherwise start over.
++		 */
++		if (plist_node_empty(&next->avail_list))
++			goto start_over;
+ 	}
+ 
++	spin_unlock(&swap_avail_lock);
++
+ 	atomic_long_inc(&nr_swap_pages);
+ noswap:
+-	spin_unlock(&swap_lock);
+ 	return (swp_entry_t) {0};
+ }
+ 
+@@ -766,27 +781,6 @@ out:
+ 	return NULL;
+ }
+ 
+-/*
+- * This swap type frees swap entry, check if it is the highest priority swap
+- * type which just frees swap entry. get_swap_page() uses
+- * highest_priority_index to search highest priority swap type. The
+- * swap_info_struct.lock can't protect us if there are multiple swap types
+- * active, so we use atomic_cmpxchg.
+- */
+-static void set_highest_priority_index(int type)
+-{
+-	int old_hp_index, new_hp_index;
+-
+-	do {
+-		old_hp_index = atomic_read(&highest_priority_index);
+-		if (old_hp_index != -1 &&
+-			swap_info[old_hp_index]->prio >= swap_info[type]->prio)
+-			break;
+-		new_hp_index = type;
+-	} while (atomic_cmpxchg(&highest_priority_index,
+-		old_hp_index, new_hp_index) != old_hp_index);
+-}
+-
+ static unsigned char swap_entry_free(struct swap_info_struct *p,
+ 				     swp_entry_t entry, unsigned char usage)
+ {
+@@ -828,9 +822,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
+ 		dec_cluster_info_page(p, p->cluster_info, offset);
+ 		if (offset < p->lowest_bit)
+ 			p->lowest_bit = offset;
+-		if (offset > p->highest_bit)
++		if (offset > p->highest_bit) {
++			bool was_full = !p->highest_bit;
+ 			p->highest_bit = offset;
+-		set_highest_priority_index(p->type);
++			if (was_full && (p->flags & SWP_WRITEOK)) {
++				spin_lock(&swap_avail_lock);
++				WARN_ON(!plist_node_empty(&p->avail_list));
++				if (plist_node_empty(&p->avail_list))
++					plist_add(&p->avail_list,
++						  &swap_avail_head);
++				spin_unlock(&swap_avail_lock);
++			}
++		}
+ 		atomic_long_inc(&nr_swap_pages);
+ 		p->inuse_pages--;
+ 		frontswap_invalidate_page(p->type, offset);
+@@ -1765,30 +1768,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
+ 				unsigned char *swap_map,
+ 				struct swap_cluster_info *cluster_info)
+ {
+-	int i, prev;
+-
+ 	if (prio >= 0)
+ 		p->prio = prio;
+ 	else
+ 		p->prio = --least_priority;
++	/*
++	 * the plist prio is negated because plist ordering is
++	 * low-to-high, while swap ordering is high-to-low
++	 */
++	p->list.prio = -p->prio;
++	p->avail_list.prio = -p->prio;
+ 	p->swap_map = swap_map;
+ 	p->cluster_info = cluster_info;
+ 	p->flags |= SWP_WRITEOK;
+ 	atomic_long_add(p->pages, &nr_swap_pages);
+ 	total_swap_pages += p->pages;
+ 
+-	/* insert swap space into swap_list: */
+-	prev = -1;
+-	for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
+-		if (p->prio >= swap_info[i]->prio)
+-			break;
+-		prev = i;
+-	}
+-	p->next = i;
+-	if (prev < 0)
+-		swap_list.head = swap_list.next = p->type;
+-	else
+-		swap_info[prev]->next = p->type;
++	assert_spin_locked(&swap_lock);
++	/*
++	 * both lists are plists, and thus priority ordered.
++	 * swap_active_head needs to be priority ordered for swapoff(),
++	 * which on removal of any swap_info_struct with an auto-assigned
++	 * (i.e. negative) priority increments the auto-assigned priority
++	 * of any lower-priority swap_info_structs.
++	 * swap_avail_head needs to be priority ordered for get_swap_page(),
++	 * which allocates swap pages from the highest available priority
++	 * swap_info_struct.
++	 */
++	plist_add(&p->list, &swap_active_head);
++	spin_lock(&swap_avail_lock);
++	plist_add(&p->avail_list, &swap_avail_head);
++	spin_unlock(&swap_avail_lock);
+ }
+ 
+ static void enable_swap_info(struct swap_info_struct *p, int prio,
+@@ -1823,8 +1833,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ 	struct address_space *mapping;
+ 	struct inode *inode;
+ 	struct filename *pathname;
+-	int i, type, prev;
+-	int err;
++	int err, found = 0;
+ 	unsigned int old_block_size;
+ 
+ 	if (!capable(CAP_SYS_ADMIN))
+@@ -1842,17 +1851,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ 		goto out;
+ 
+ 	mapping = victim->f_mapping;
+-	prev = -1;
+ 	spin_lock(&swap_lock);
+-	for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
+-		p = swap_info[type];
++	plist_for_each_entry(p, &swap_active_head, list) {
+ 		if (p->flags & SWP_WRITEOK) {
+-			if (p->swap_file->f_mapping == mapping)
++			if (p->swap_file->f_mapping == mapping) {
++				found = 1;
+ 				break;
++			}
+ 		}
+-		prev = type;
+ 	}
+-	if (type < 0) {
++	if (!found) {
+ 		err = -EINVAL;
+ 		spin_unlock(&swap_lock);
+ 		goto out_dput;
+@@ -1864,20 +1872,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ 		spin_unlock(&swap_lock);
+ 		goto out_dput;
+ 	}
+-	if (prev < 0)
+-		swap_list.head = p->next;
+-	else
+-		swap_info[prev]->next = p->next;
+-	if (type == swap_list.next) {
+-		/* just pick something that's safe... */
+-		swap_list.next = swap_list.head;
+-	}
++	spin_lock(&swap_avail_lock);
++	plist_del(&p->avail_list, &swap_avail_head);
++	spin_unlock(&swap_avail_lock);
+ 	spin_lock(&p->lock);
+ 	if (p->prio < 0) {
+-		for (i = p->next; i >= 0; i = swap_info[i]->next)
+-			swap_info[i]->prio = p->prio--;
++		struct swap_info_struct *si = p;
++
++		plist_for_each_entry_continue(si, &swap_active_head, list) {
++			si->prio++;
++			si->list.prio--;
++			si->avail_list.prio--;
++		}
+ 		least_priority++;
+ 	}
++	plist_del(&p->list, &swap_active_head);
+ 	atomic_long_sub(p->pages, &nr_swap_pages);
+ 	total_swap_pages -= p->pages;
+ 	p->flags &= ~SWP_WRITEOK;
+@@ -1885,7 +1894,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ 	spin_unlock(&swap_lock);
+ 
+ 	set_current_oom_origin();
+-	err = try_to_unuse(type, false, 0); /* force all pages to be unused */
++	err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
+ 	clear_current_oom_origin();
+ 
+ 	if (err) {
+@@ -1926,7 +1935,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ 	frontswap_map = frontswap_map_get(p);
+ 	spin_unlock(&p->lock);
+ 	spin_unlock(&swap_lock);
+-	frontswap_invalidate_area(type);
++	frontswap_invalidate_area(p->type);
+ 	frontswap_map_set(p, NULL);
+ 	mutex_unlock(&swapon_mutex);
+ 	free_percpu(p->percpu_cluster);
+@@ -1935,7 +1944,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ 	vfree(cluster_info);
+ 	vfree(frontswap_map);
+ 	/* Destroy swap account information */
+-	swap_cgroup_swapoff(type);
++	swap_cgroup_swapoff(p->type);
+ 
+ 	inode = mapping->host;
+ 	if (S_ISBLK(inode->i_mode)) {
+@@ -2142,8 +2151,9 @@ static struct swap_info_struct *alloc_swap_info(void)
+ 		 */
+ 	}
+ 	INIT_LIST_HEAD(&p->first_swap_extent.list);
++	plist_node_init(&p->list, 0);
++	plist_node_init(&p->avail_list, 0);
+ 	p->flags = SWP_USED;
+-	p->next = -1;
+ 	spin_unlock(&swap_lock);
+ 	spin_lock_init(&p->lock);
+ 
+diff --git a/mm/vmacache.c b/mm/vmacache.c
+new file mode 100644
+index 000000000000..1037a3bab505
+--- /dev/null
++++ b/mm/vmacache.c
+@@ -0,0 +1,114 @@
++/*
++ * Copyright (C) 2014 Davidlohr Bueso.
++ */
++#include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/vmacache.h>
++
++/*
++ * Flush vma caches for threads that share a given mm.
++ *
++ * The operation is safe because the caller holds the mmap_sem
++ * exclusively and other threads accessing the vma cache will
++ * have mmap_sem held at least for read, so no extra locking
++ * is required to maintain the vma cache.
++ */
++void vmacache_flush_all(struct mm_struct *mm)
++{
++	struct task_struct *g, *p;
++
++	rcu_read_lock();
++	for_each_process_thread(g, p) {
++		/*
++		 * Only flush the vmacache pointers as the
++		 * mm seqnum is already set and curr's will
++		 * be set upon invalidation when the next
++		 * lookup is done.
++		 */
++		if (mm == p->mm)
++			vmacache_flush(p);
++	}
++	rcu_read_unlock();
++}
++
++/*
++ * This task may be accessing a foreign mm via (for example)
++ * get_user_pages()->find_vma().  The vmacache is task-local and this
++ * task's vmacache pertains to a different mm (ie, its own).  There is
++ * nothing we can do here.
++ *
++ * Also handle the case where a kernel thread has adopted this mm via use_mm().
++ * That kernel thread's vmacache is not applicable to this mm.
++ */
++static bool vmacache_valid_mm(struct mm_struct *mm)
++{
++	return current->mm == mm && !(current->flags & PF_KTHREAD);
++}
++
++void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
++{
++	if (vmacache_valid_mm(newvma->vm_mm))
++		current->vmacache[VMACACHE_HASH(addr)] = newvma;
++}
++
++static bool vmacache_valid(struct mm_struct *mm)
++{
++	struct task_struct *curr;
++
++	if (!vmacache_valid_mm(mm))
++		return false;
++
++	curr = current;
++	if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
++		/*
++		 * First attempt will always be invalid, initialize
++		 * the new cache for this task here.
++		 */
++		curr->vmacache_seqnum = mm->vmacache_seqnum;
++		vmacache_flush(curr);
++		return false;
++	}
++	return true;
++}
++
++struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
++{
++	int i;
++
++	if (!vmacache_valid(mm))
++		return NULL;
++
++	for (i = 0; i < VMACACHE_SIZE; i++) {
++		struct vm_area_struct *vma = current->vmacache[i];
++
++		if (!vma)
++			continue;
++		if (WARN_ON_ONCE(vma->vm_mm != mm))
++			break;
++		if (vma->vm_start <= addr && vma->vm_end > addr)
++			return vma;
++	}
++
++	return NULL;
++}
++
++#ifndef CONFIG_MMU
++struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
++					   unsigned long start,
++					   unsigned long end)
++{
++	int i;
++
++	if (!vmacache_valid(mm))
++		return NULL;
++
++	for (i = 0; i < VMACACHE_SIZE; i++) {
++		struct vm_area_struct *vma = current->vmacache[i];
++
++		if (vma && vma->vm_start == start && vma->vm_end == end)
++			return vma;
++	}
++
++	return NULL;
++}
++#endif
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index 6ef484f0777f..0c0b36e5b4f8 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -224,15 +224,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+ 	unsigned long freed = 0;
+ 	unsigned long long delta;
+ 	long total_scan;
+-	long max_pass;
++	long freeable;
+ 	long nr;
+ 	long new_nr;
+ 	int nid = shrinkctl->nid;
+ 	long batch_size = shrinker->batch ? shrinker->batch
+ 					  : SHRINK_BATCH;
+ 
+-	max_pass = shrinker->count_objects(shrinker, shrinkctl);
+-	if (max_pass == 0)
++	freeable = shrinker->count_objects(shrinker, shrinkctl);
++	if (freeable == 0)
+ 		return 0;
+ 
+ 	/*
+@@ -244,14 +244,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+ 
+ 	total_scan = nr;
+ 	delta = (4 * nr_pages_scanned) / shrinker->seeks;
+-	delta *= max_pass;
++	delta *= freeable;
+ 	do_div(delta, lru_pages + 1);
+ 	total_scan += delta;
+ 	if (total_scan < 0) {
+ 		printk(KERN_ERR
+ 		"shrink_slab: %pF negative objects to delete nr=%ld\n",
+ 		       shrinker->scan_objects, total_scan);
+-		total_scan = max_pass;
++		total_scan = freeable;
+ 	}
+ 
+ 	/*
+@@ -260,26 +260,26 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+ 	 * shrinkers to return -1 all the time. This results in a large
+ 	 * nr being built up so when a shrink that can do some work
+ 	 * comes along it empties the entire cache due to nr >>>
+-	 * max_pass.  This is bad for sustaining a working set in
++	 * freeable. This is bad for sustaining a working set in
+ 	 * memory.
+ 	 *
+ 	 * Hence only allow the shrinker to scan the entire cache when
+ 	 * a large delta change is calculated directly.
+ 	 */
+-	if (delta < max_pass / 4)
+-		total_scan = min(total_scan, max_pass / 2);
++	if (delta < freeable / 4)
++		total_scan = min(total_scan, freeable / 2);
+ 
+ 	/*
+ 	 * Avoid risking looping forever due to too large nr value:
+ 	 * never try to free more than twice the estimate number of
+ 	 * freeable entries.
+ 	 */
+-	if (total_scan > max_pass * 2)
+-		total_scan = max_pass * 2;
++	if (total_scan > freeable * 2)
++		total_scan = freeable * 2;
+ 
+ 	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+ 				nr_pages_scanned, lru_pages,
+-				max_pass, delta, total_scan);
++				freeable, delta, total_scan);
+ 
+ 	/*
+ 	 * Normally, we should not scan less than batch_size objects in one
+@@ -292,12 +292,12 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+ 	 *
+ 	 * We detect the "tight on memory" situations by looking at the total
+ 	 * number of objects we want to scan (total_scan). If it is greater
+-	 * than the total number of objects on slab (max_pass), we must be
++	 * than the total number of objects on slab (freeable), we must be
+ 	 * scanning at high prio and therefore should try to reclaim as much as
+ 	 * possible.
+ 	 */
+ 	while (total_scan >= batch_size ||
+-	       total_scan >= max_pass) {
++	       total_scan >= freeable) {
+ 		unsigned long ret;
+ 		unsigned long nr_to_scan = min(batch_size, total_scan);
+ 
+@@ -1144,7 +1144,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+ 			TTU_UNMAP|TTU_IGNORE_ACCESS,
+ 			&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
+ 	list_splice(&clean_pages, page_list);
+-	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
++	mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+ 	return ret;
+ }
+ 
+@@ -2424,8 +2424,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
+ 			unsigned long lru_pages = 0;
+ 
+ 			nodes_clear(shrink->nodes_to_scan);
+-			for_each_zone_zonelist(zone, z, zonelist,
+-					gfp_zone(sc->gfp_mask)) {
++			for_each_zone_zonelist_nodemask(zone, z, zonelist,
++					gfp_zone(sc->gfp_mask), sc->nodemask) {
+ 				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ 					continue;
+ 


Thread overview: 85+ messages
2014-10-09 23:03 Mike Pagano [this message]
  -- strict thread matches above, loose matches on Subject: below --
2016-12-11 22:31 [gentoo-commits] proj/linux-patches:3.14 commit in: / Mike Pagano
2016-09-11 17:39 Mike Pagano
2016-09-09 19:22 Mike Pagano
2016-08-20 16:29 Mike Pagano
2016-08-17 12:18 Mike Pagano
2016-08-10 12:53 Mike Pagano
2016-07-27 19:15 Mike Pagano
2016-06-24 20:37 Mike Pagano
2016-06-08 11:21 Mike Pagano
2016-06-02 18:01 Mike Pagano
2016-05-19 12:38 Mike Pagano
2016-05-12  0:07 Mike Pagano
2016-05-04 23:46 Mike Pagano
2016-04-20 10:10 Mike Pagano
2016-04-12 19:01 Mike Pagano
2016-03-16 19:41 Mike Pagano
2016-03-10  0:49 Mike Pagano
2016-03-04  0:16 Mike Pagano
2016-02-25 23:29 Mike Pagano
2016-02-17 23:58 Mike Pagano
2016-01-31 21:34 Mike Pagano
2016-01-23 18:58 Mike Pagano
2016-01-20 15:13 Mike Pagano
2015-12-10 13:52 Mike Pagano
2015-11-10  0:05 Mike Pagano
2015-10-27 13:38 Mike Pagano
2015-10-23 19:40 Mike Pagano
2015-10-01 13:18 Mike Pagano
2015-09-21 17:37 Mike Pagano
2015-09-14 16:23 Mike Pagano
2015-08-17 16:37 Mike Pagano
2015-08-10 23:13 Mike Pagano
2015-08-03 22:33 Mike Pagano
2015-07-17 15:34 Mike Pagano
2015-07-10 23:40 Mike Pagano
2015-07-07  0:44 Mike Pagano
2015-06-30 14:34 Mike Pagano
2015-06-23 17:10 Mike Pagano
2015-06-06 21:34 Mike Pagano
2015-05-18 19:33 Mike Pagano
2015-05-13 19:23 Mike Pagano
2015-05-08 12:14 Mike Pagano
2015-04-29 17:04 Mike Pagano
2015-04-20  9:42 Mike Pagano
2015-04-14  9:50 Mike Pagano
2015-03-28 20:25 Mike Pagano
2015-03-26 20:52 Mike Pagano
2015-03-19 12:42 Mike Pagano
2015-03-07 14:45 Mike Pagano
2015-02-27 14:34 Mike Pagano
2015-02-14 21:11 Mike Pagano
2015-02-11 15:16 Mike Pagano
2015-02-07  1:28 Mike Pagano
2015-01-30 11:12 Mike Pagano
2015-01-28 22:16 Anthony G. Basile
2015-01-28 22:01 Anthony G. Basile
2015-01-17  0:55 Mike Pagano
2015-01-09 18:28 Mike Pagano
2015-01-09 16:18 Mike Pagano
2015-01-02 19:10 Mike Pagano
2014-12-16 20:29 Mike Pagano
2014-12-09 23:03 Mike Pagano
2014-11-23 12:07 Anthony G. Basile
2014-11-22 20:16 Mike Pagano
2014-11-15  0:32 Mike Pagano
2014-10-30 22:56 Mike Pagano
2014-10-30 22:42 Mike Pagano
2014-10-15 15:43 Mike Pagano
2014-10-06 15:44 Mike Pagano
2014-09-17 19:59 Anthony G. Basile
2014-09-09 22:16 Vlastimil Babka
2014-08-19 11:44 Mike Pagano
2014-08-08 18:30 ` Mike Pagano
2014-08-14 12:44 Mike Pagano
2014-08-19 11:44 ` Mike Pagano
2014-08-02  0:19 Mike Pagano
2014-08-19 11:44 ` Mike Pagano
2014-07-28 19:17 Mike Pagano
2014-08-19 11:44 ` Mike Pagano
2014-07-18 12:05 Mike Pagano
2014-07-09 23:09 Mike Pagano
2014-07-08 18:04 Mike Pagano
2014-07-01 12:08 Mike Pagano
2014-06-27 15:00 Mike Pagano
